almost ready

2026-06-01 10:52:06 -05:00
parent 8b0eb0db78
commit 763305ca89
94 changed files with 8766 additions and 2674 deletions
--- a/worker/skinland_worker.py
+++ b/worker/skinland_worker.py
@@ -0,0 +1,174 @@
+"""skin.land scrape worker (pull model).
+
+A thin strategy over blworker.Worker, mirroring the cs.money worker — it supplies only the
+skin.land-specific bits; the warm session, poll/scrape/post loop, IPRoyal proxy, IP
+rotation, logging and shutdown all live in the shared runtime. Env knobs: worker/README.md.
+
+How skin.land is scraped (learned from the discovery probes):
+  - A job's target is the market PAGE URL, e.g.
+    https://skin.land/market/csgo/ak-47-redline-field-tested/
+  - That Nuxt page embeds an internal numeric skin_id. We resolve it once from the page's
+    __NUXT__ payload (the skin object whose `url` == the page slug), cache it per slug, then
+    page the clean JSON API:
+        GET https://app.skin.land/api/v2/obtained-skins?skin_id={id}&page={n}
+    which returns a Laravel paginator {data:[...offers], meta:{current_page,last_page,…}}.
+  - We walk pages 1..last_page (capped by the job's maxPages), dedup offers by id, and post.
+
+Run locally (Windows PowerShell):
+
+    cd worker
+    .venv\\Scripts\\Activate.ps1
+    pip install -r requirements.txt
+    python skinland_worker.py
+"""
+
+import json
+import re
+
+from blworker import ScrapeResult, Worker, click, looks_like_challenge, page_fetch, run
+
+# The offers API, as a str.format template with two placeholders: {skin_id} is skin.land's
+# internal id (resolved from the market page) and {page} is the Laravel paginator page
+# number. Same warm session, fetched in-page (CORS-allowed app subdomain).
+API = "https://app.skin.land/api/v2/obtained-skins?skin_id={skin_id}&page={page}"
+
+# The page's Nuxt payload is a devalue flat array; the main skin object is the one whose
+# `url` field resolves to the page slug, and its `id` field resolves to the skin_id.
+# This pattern only locates the START of that array in the HTML: the outer `[` directly
+# followed by a `["ShallowReactive",<digits>]` or `["Reactive",<digits>]` head entry.
+# extract_nuxt_array() parses the array beginning at that match offset.
+NUXT_ARRAY_RE = re.compile(r'\[\["(?:ShallowReactive|Reactive)",\d+\]')
+
+
+def slug_of(url: str) -> str:
+    """Return the last path segment of a market page URL (the skin slug).
+
+    Any ``#fragment`` and ``?query`` suffix is dropped first, then trailing slashes, so
+    ``https://skin.land/market/csgo/ak-47-redline-field-tested/?ref=x`` yields
+    ``ak-47-redline-field-tested`` rather than ``?ref=x``. A URL without a query or
+    fragment gives exactly the same result as before.
+    """
+    # Cut the fragment first: a "?" that appears inside a fragment is not a query separator.
+    path = url.split("#", 1)[0].split("?", 1)[0]
+    return path.rstrip("/").rsplit("/", 1)[-1]
+
+
+def extract_nuxt_array(html: str) -> list | None:
+    """Pull the Nuxt devalue payload (a JSON flat array of values with index references)
+    out of the page HTML.
+
+    NUXT_ARRAY_RE locates where the array starts; the JSON decoder then parses exactly one
+    JSON value from that offset and stops at the array's closing bracket, ignoring whatever
+    HTML follows. This replaces manual bracket/string/escape tracking with the stdlib
+    parser, which already handles brackets inside strings and escaped quotes.
+
+    Returns the parsed list, or None when the marker is not found or the text at the
+    marker is not valid JSON (including a truncated payload).
+    """
+    m = NUXT_ARRAY_RE.search(html)
+    if not m:
+        return None
+    try:
+        # raw_decode tolerates trailing data after the first complete JSON value.
+        payload, _end = json.JSONDecoder().raw_decode(html[m.start():])
+    except json.JSONDecodeError:
+        return None
+    return payload
+
+
+def resolve_skin_id(html: str, slug: str) -> int | None:
+    """Find the page's main skin object in the Nuxt payload — the dict whose `url` field
+    resolves to the page slug — and return its resolved `id` (skin.land's internal skin_id
+    used by the obtained-skins API).
+
+    In the flat array, a dict's values are indices into the same array. A value that is an
+    in-range index is dereferenced; anything else is taken as-is. Returns None when the
+    payload is missing/empty, no dict matches the slug, or no matching dict has a usable
+    integer id.
+    """
+    arr = extract_nuxt_array(html)
+    if not arr:
+        return None
+
+    def is_plain_int(x) -> bool:
+        # bool is a subclass of int; True/False must never act as an index or an id.
+        return isinstance(x, int) and not isinstance(x, bool)
+
+    def val(ref):
+        return arr[ref] if is_plain_int(ref) and 0 <= ref < len(arr) else ref
+
+    for el in arr:
+        if not isinstance(el, dict) or "url" not in el or "id" not in el:
+            continue
+        if val(el["url"]) != slug:
+            continue
+        sid = val(el["id"])
+        # Negative numbers are devalue's markers for special values (undefined, hole, NaN,
+        # …), not data. Without this check an object with `id: undefined` would hand back
+        # -1 as the skin_id, which would then be cached and sent to the API.
+        if is_plain_int(sid) and sid >= 0:
+            return sid
+    return None
+
+
+class SkinLandWorker(Worker):
+    """skin.land strategy on top of the shared ``blworker.Worker`` runtime.
+
+    Supplies the skin.land-specific hooks: how to describe a job, how to dismiss the
+    cookie banner, and how to scrape one job. Everything else is inherited.
+    """
+
+    name = "skinland"
+    jobs_path = "/skinland/jobs"
+    default_market_url = "https://skin.land/market/csgo/"
+
+    def __init__(self, settings):
+        super().__init__(settings)
+        # skin_id is stable per skin+wear, so cache it per slug to skip the market-page
+        # fetch on re-sweeps.
+        self._skin_id_cache: dict[str, int] = {}
+
+    def describe_job(self, job) -> str:
+        """Short label for a job: the slug of its market page URL."""
+        return slug_of(job["url"])
+
+    async def dismiss_consent(self, page) -> str | None:
+        """Privacy-preserving: dismiss the cookie banner with essential-only if present.
+
+        Tries each known button label in order and stops at the first one `click` reports
+        as clicked. Returns a short description of what was clicked, or None if no label
+        matched.
+        """
+        for label in ("Accept essential", "ACCEPT ESSENTIAL", "Reject all"):
+            if await click(page, label):
+                return f"dismissed via {label!r}"
+        return None
+
+    async def _get_skin_id(self, page, job, slug: str) -> tuple[int | None, str, int]:
+        """Resolve (and cache) skin.land's skin_id for this slug. Returns
+        (skin_id, reason, wire); reason is "" on success, else a partial-stop reason
+        ("challenged" or "no-skin-id"). wire is 0 on a cache hit, since nothing is fetched.
+        """
+        if slug in self._skin_id_cache:
+            return self._skin_id_cache[slug], "", 0
+
+        _status, html, wire = await page_fetch(page, job["url"], accept="text/html")
+        # Clamp once: a negative wire value is never reported (presumably "size unknown").
+        wire = max(wire, 0)
+        if looks_like_challenge(html):
+            return None, "challenged", wire
+        skin_id = resolve_skin_id(html, slug)
+        if skin_id is None:
+            return None, "no-skin-id", wire
+        self._skin_id_cache[slug] = skin_id
+        return skin_id, "", wire
+
+    async def scrape_job(self, page, job) -> ScrapeResult:
+        """Scrape ALL offers for one skin+wear by paging the obtained-skins API.
+
+        Walks pages 1..N, deduplicating offers by their `id`, and stops when:
+          - a page has no offers, or meta.last_page is reached      -> "completed"
+          - the job's maxPages cap (default 40) is exhausted        -> "fetch-cap"
+          - a response looks like a challenge page                  -> "challenged"
+          - a response is not JSON, or is JSON but not an object    -> "bad-json"
+        Offers collected before an early stop are still returned. The ScrapeResult is
+        built positionally as (offers, API fetch count, reason, wire bytes).
+        """
+        slug = slug_of(job["url"])
+        max_pages = job.get("maxPages")
+        if max_pages is None:
+            # Key missing OR explicitly null: both mean "use the default cap". A null would
+            # otherwise raise TypeError in the loop condition below.
+            max_pages = 40
+
+        skin_id, reason, wire = await self._get_skin_id(page, job, slug)
+        if skin_id is None:
+            return ScrapeResult([], 0, reason, wire)
+
+        seen: dict = {}
+        fetches = 0
+        page_n = 1
+        reason = "completed"
+        while page_n <= max_pages:
+            # NOTE(review): the HTTP status is ignored. An error response that is a JSON
+            # object without `data` ends the walk as "completed" — confirm whether that is
+            # acceptable or whether the status should be checked here.
+            _status, body, wbytes = await page_fetch(page, API.format(skin_id=skin_id, page=page_n))
+            fetches += 1
+            if wbytes > 0:
+                wire += wbytes
+
+            if looks_like_challenge(body):
+                return ScrapeResult(list(seen.values()), fetches, "challenged", wire)
+            try:
+                payload = json.loads(body)
+            except json.JSONDecodeError:
+                return ScrapeResult(list(seen.values()), fetches, "bad-json", wire)
+            if not isinstance(payload, dict):
+                # Valid JSON but not the paginator object (e.g. `null`, a list, a string):
+                # report it instead of crashing on `.get`.
+                return ScrapeResult(list(seen.values()), fetches, "bad-json", wire)
+
+            offers = payload.get("data")
+            if not isinstance(offers, list):
+                offers = []
+            for o in offers:
+                # Skip malformed entries rather than aborting the whole job.
+                if isinstance(o, dict) and o.get("id") is not None:
+                    seen[o["id"]] = o
+
+            meta = payload.get("meta")
+            last = meta.get("last_page") if isinstance(meta, dict) else None
+            if not offers or (isinstance(last, int) and page_n >= last):
+                break  # walked the final page
+            page_n += 1
+            await self._pace(page)
+        else:
+            # Loop condition went false without a break: the cap ran out first.
+            reason = "fetch-cap"
+
+        return ScrapeResult(list(seen.values()), fetches, reason, wire)
+
+
+# Script entry point: when executed directly, hand the worker class to blworker's run().
+if __name__ == "__main__":
+    run(SkinLandWorker)