"""skin.land scrape worker (pull model). A thin strategy over blworker.Worker, mirroring the cs.money worker — it supplies only the skin.land-specific bits; the warm session, poll/scrape/post loop, IPRoyal proxy, IP rotation, logging and shutdown all live in the shared runtime. Env knobs: worker/README.md. How skin.land is scraped (learned from the discovery probes): - A job's target is the market PAGE URL, e.g. https://skin.land/market/csgo/ak-47-redline-field-tested/ - That Nuxt page embeds an internal numeric skin_id. We resolve it once from the page's __NUXT__ payload (the skin object whose `url` == the page slug), cache it per slug, then page the clean JSON API: GET https://app.skin.land/api/v2/obtained-skins?skin_id={id}&page={n} which returns a Laravel paginator {data:[...offers], meta:{current_page,last_page,…}}. - We walk pages 1..last_page (capped by the job's maxPages), dedup offers by id, and post. cd worker .venv\\Scripts\\Activate.ps1 pip install -r requirements.txt python skinland_worker.py """ import json import re from blworker import ScrapeResult, Worker, click, looks_like_challenge, page_fetch, run # The offers API. skin_id is skin.land's internal id (resolved from the page); page is the # Laravel paginator page. Same warm session, fetched in-page (CORS-allowed app subdomain). API = "https://app.skin.land/api/v2/obtained-skins?skin_id={skin_id}&page={page}" # The page's Nuxt payload is a devalue flat array; the main skin object is the one whose # `url` field resolves to the page slug, and its `id` field resolves to the skin_id. NUXT_ARRAY_RE = re.compile(r'\[\["(?:ShallowReactive|Reactive)",\d+\]') def slug_of(url: str) -> str: return url.rstrip("/").rsplit("/", 1)[-1] def extract_nuxt_array(html: str): """Pull the Nuxt devalue payload (a JSON flat array of values with index references) out of the page HTML. Returns the parsed list, or None.""" m = NUXT_ARRAY_RE.search(html) if not m: return None start = m.start() depth = 0 instr = False esc = False for i in range(start, len(html)): ch = html[i] if esc: esc = False continue if ch == "\\": esc = True continue if ch == '"': instr = not instr continue if instr: continue if ch == "[": depth += 1 elif ch == "]": depth -= 1 if depth == 0: try: return json.loads(html[start:i + 1]) except json.JSONDecodeError: return None return None def resolve_skin_id(html: str, slug: str) -> int | None: """Find the page's main skin object in the Nuxt payload — the dict whose `url` field resolves to the page slug — and return its resolved `id` (skin.land's internal skin_id used by the obtained-skins API).""" arr = extract_nuxt_array(html) if not arr: return None def val(ref): return arr[ref] if isinstance(ref, int) and 0 <= ref < len(arr) else ref for el in arr: if isinstance(el, dict) and "url" in el and "id" in el and val(el["url"]) == slug: sid = val(el["id"]) if isinstance(sid, int): return sid return None class SkinLandWorker(Worker): name = "skinland" jobs_path = "/skinland/jobs" default_market_url = "https://skin.land/market/csgo/" def __init__(self, settings): super().__init__(settings) # skin_id is stable per skin+wear, so cache it per slug to skip the ~page fetch on # re-sweeps. self._skin_id_cache: dict[str, int] = {} def describe_job(self, job) -> str: return slug_of(job["url"]) async def dismiss_consent(self, page) -> str | None: """Privacy-preserving: dismiss the cookie banner with essential-only if present.""" for label in ("Accept essential", "ACCEPT ESSENTIAL", "Reject all"): if await click(page, label): return f"dismissed via {label!r}" return None async def _get_skin_id(self, page, job, slug: str) -> tuple[int | None, str, int]: """Resolve (and cache) skin.land's skin_id for this slug. Returns (skin_id, reason, wire); reason is "" on success, else a partial-stop reason.""" if slug in self._skin_id_cache: return self._skin_id_cache[slug], "", 0 _status, html, wire = await page_fetch(page, job["url"], accept="text/html") if looks_like_challenge(html): return None, "challenged", max(wire, 0) skin_id = resolve_skin_id(html, slug) if skin_id is None: return None, "no-skin-id", max(wire, 0) self._skin_id_cache[slug] = skin_id return skin_id, "", max(wire, 0) async def scrape_job(self, page, job) -> ScrapeResult: """Scrape ALL offers for one skin+wear by paging the obtained-skins API.""" slug = slug_of(job["url"]) max_pages = job.get("maxPages", 40) skin_id, reason, wire = await self._get_skin_id(page, job, slug) if skin_id is None: return ScrapeResult([], 0, reason, wire) seen: dict = {} fetches = 0 page_n = 1 reason = "completed" while page_n <= max_pages: _status, body, wbytes = await page_fetch(page, API.format(skin_id=skin_id, page=page_n)) fetches += 1 if wbytes > 0: wire += wbytes if looks_like_challenge(body): return ScrapeResult(list(seen.values()), fetches, "challenged", wire) try: payload = json.loads(body) except json.JSONDecodeError: return ScrapeResult(list(seen.values()), fetches, "bad-json", wire) for o in payload.get("data") or []: if o.get("id") is not None: seen[o["id"]] = o meta = payload.get("meta") or {} last = meta.get("last_page") if not payload.get("data") or (isinstance(last, int) and page_n >= last): break # walked the final page page_n += 1 await self._pace(page) else: reason = "fetch-cap" return ScrapeResult(list(seen.values()), fetches, reason, wire) if __name__ == "__main__": run(SkinLandWorker)