almost ready
This commit is contained in:
174
worker/skinland_worker.py
Normal file
174
worker/skinland_worker.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""skin.land scrape worker (pull model).
|
||||
|
||||
A thin strategy over blworker.Worker, mirroring the cs.money worker — it supplies only the
|
||||
skin.land-specific bits; the warm session, poll/scrape/post loop, IPRoyal proxy, IP
|
||||
rotation, logging and shutdown all live in the shared runtime. Env knobs: worker/README.md.
|
||||
|
||||
How skin.land is scraped (learned from the discovery probes):
|
||||
- A job's target is the market PAGE URL, e.g.
|
||||
https://skin.land/market/csgo/ak-47-redline-field-tested/
|
||||
- That Nuxt page embeds an internal numeric skin_id. We resolve it once from the page's
|
||||
__NUXT__ payload (the skin object whose `url` == the page slug), cache it per slug, then
|
||||
page the clean JSON API:
|
||||
GET https://app.skin.land/api/v2/obtained-skins?skin_id={id}&page={n}
|
||||
which returns a Laravel paginator {data:[...offers], meta:{current_page,last_page,…}}.
|
||||
- We walk pages 1..last_page (capped by the job's maxPages), dedup offers by id, and post.
|
||||
|
||||
cd worker
|
||||
.venv\\Scripts\\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
python skinland_worker.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
|
||||
from blworker import ScrapeResult, Worker, click, looks_like_challenge, page_fetch, run
|
||||
|
||||
# The offers API. skin_id is skin.land's internal id (resolved from the page); page is the
|
||||
# Laravel paginator page. Same warm session, fetched in-page (CORS-allowed app subdomain).
|
||||
# The {skin_id} and {page} placeholders are filled per request via API.format(...).
API = "https://app.skin.land/api/v2/obtained-skins?skin_id={skin_id}&page={page}"
|
||||
|
||||
# The page's Nuxt payload is a devalue flat array; the main skin object is the one whose
|
||||
# `url` field resolves to the page slug, and its `id` field resolves to the skin_id.
|
||||
# Matches the head of the flat array, i.e. `[["ShallowReactive",<digits>]` or
# `[["Reactive",<digits>]`; the match start is where the JSON array begins.
NUXT_ARRAY_RE = re.compile(r'\[\["(?:ShallowReactive|Reactive)",\d+\]')
|
||||
|
||||
|
||||
def slug_of(url: str) -> str:
    """Return the last path segment (the skin slug) of a market page URL.

    A ``#fragment`` or ``?query`` suffix and any trailing slashes are dropped before the
    split, so ``https://skin.land/market/csgo/ak-47-redline-field-tested/?ref=x`` yields
    ``ak-47-redline-field-tested``. Without that step the query string itself would be
    returned as the slug and could never equal the `url` field in the page payload.
    """
    # Fragment first: a "?" that appears after "#" belongs to the fragment, not a query.
    path = url.partition("#")[0].partition("?")[0]
    return path.rstrip("/").rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
def extract_nuxt_array(html: str):
    """Pull the Nuxt devalue payload (a JSON flat array of values with index references)
    out of the page HTML.

    The array is located by NUXT_ARRAY_RE and parsed from that offset with the standard
    JSON decoder, which stops at the array's closing bracket and ignores the HTML that
    follows it.

    Returns the parsed list, or None when the marker is absent or the text starting at
    the marker is not valid JSON.
    """
    m = NUXT_ARRAY_RE.search(html)
    if not m:
        return None
    try:
        # raw_decode parses exactly one JSON value and tolerates trailing data, so there
        # is no need to hand-scan for the matching "]" first.
        payload, _end = json.JSONDecoder().raw_decode(html[m.start():])
    except json.JSONDecodeError:
        return None
    return payload
|
||||
|
||||
|
||||
def resolve_skin_id(html: str, slug: str) -> int | None:
    """Find the page's main skin object in the Nuxt payload and return its skin_id.

    The main skin object is the dict whose `url` field resolves to the page slug; its
    resolved `id` is skin.land's internal skin_id used by the obtained-skins API.

    Returns None when the payload cannot be extracted or no matching dict resolves to an
    integer id.
    """
    arr = extract_nuxt_array(html)
    if not arr:
        return None

    def is_int(value) -> bool:
        # bool is an int subclass; True/False must not be read as indices 1/0 or as an id.
        return isinstance(value, int) and not isinstance(value, bool)

    def val(ref):
        # In-range integers are treated as references into the flat array and
        # dereferenced; anything else is returned as-is.
        return arr[ref] if is_int(ref) and 0 <= ref < len(arr) else ref

    for el in arr:
        if not isinstance(el, dict) or "url" not in el or "id" not in el:
            continue
        if val(el["url"]) != slug:
            continue
        sid = val(el["id"])
        if is_int(sid):
            return sid
        # A slug match without an integer id is skipped; a later entry may still match.
    return None
|
||||
|
||||
|
||||
class SkinLandWorker(Worker):
    """skin.land strategy on top of the shared blworker Worker.

    Resolves a market page's internal skin_id from its Nuxt payload (cached per slug) and
    pages the obtained-skins JSON API to collect the offers for that skin+wear.
    """

    name = "skinland"
    jobs_path = "/skinland/jobs"
    default_market_url = "https://skin.land/market/csgo/"

    def __init__(self, settings):
        super().__init__(settings)
        # skin_id is stable per skin+wear, so cache it per slug to skip the page fetch on
        # re-sweeps.
        self._skin_id_cache: dict[str, int] = {}

    def describe_job(self, job) -> str:
        """Return the job's page slug as its short description."""
        return slug_of(job["url"])

    async def dismiss_consent(self, page) -> str | None:
        """Privacy-preserving: dismiss the cookie banner with essential-only if present.

        Tries each known button label in order and returns a short note naming the label
        that was clicked, or None when none of them could be clicked.
        """
        for label in ("Accept essential", "ACCEPT ESSENTIAL", "Reject all"):
            if await click(page, label):
                return f"dismissed via {label!r}"
        return None

    async def _get_skin_id(self, page, job, slug: str) -> tuple[int | None, str, int]:
        """Resolve (and cache) skin.land's skin_id for this slug.

        Returns (skin_id, reason, wire); reason is "" on success, else a partial-stop
        reason ("challenged" or "no-skin-id"). wire is the byte count reported by the
        page fetch clamped to >= 0, or 0 on a cache hit (no fetch is made).
        """
        if slug in self._skin_id_cache:
            return self._skin_id_cache[slug], "", 0

        _status, html, wire = await page_fetch(page, job["url"], accept="text/html")
        wire = max(wire, 0)
        if looks_like_challenge(html):
            return None, "challenged", wire
        skin_id = resolve_skin_id(html, slug)
        if skin_id is None:
            return None, "no-skin-id", wire
        self._skin_id_cache[slug] = skin_id
        return skin_id, "", wire

    async def scrape_job(self, page, job) -> ScrapeResult:
        """Scrape ALL offers for one skin+wear by paging the obtained-skins API.

        Walks pages 1..last_page, capped by the job's maxPages (default 40), and dedups
        offers by id. The result reason is:
          - "completed"  the final page (or an empty page) was reached;
          - "fetch-cap"  maxPages was exhausted before the final page;
          - "challenged" a response looked like a challenge page;
          - "bad-json"   a response was not JSON, or was JSON without a `data` list.
        Offers collected before an early stop are still returned.
        """
        slug = slug_of(job["url"])
        # A job may omit maxPages or carry it as null; both fall back to the default cap.
        max_pages = job.get("maxPages")
        if max_pages is None:
            max_pages = 40

        skin_id, reason, wire = await self._get_skin_id(page, job, slug)
        if skin_id is None:
            return ScrapeResult([], 0, reason, wire)

        seen: dict = {}
        fetches = 0
        page_n = 1
        reason = "completed"
        while page_n <= max_pages:
            _status, body, wbytes = await page_fetch(page, API.format(skin_id=skin_id, page=page_n))
            fetches += 1
            if wbytes > 0:  # non-positive byte counts are not added to the total
                wire += wbytes

            if looks_like_challenge(body):
                return ScrapeResult(list(seen.values()), fetches, "challenged", wire)
            try:
                payload = json.loads(body)
            except (json.JSONDecodeError, TypeError):
                # TypeError covers a non-text body (e.g. None) should the fetch return one.
                return ScrapeResult(list(seen.values()), fetches, "bad-json", wire)

            offers = payload.get("data") if isinstance(payload, dict) else None
            if not isinstance(offers, list):
                # Valid JSON that is not a paginator (e.g. an error body without `data`)
                # must not be mistaken for "ran out of offers" and reported as completed.
                return ScrapeResult(list(seen.values()), fetches, "bad-json", wire)

            for o in offers:
                if isinstance(o, dict) and o.get("id") is not None:
                    seen[o["id"]] = o

            meta = payload.get("meta") or {}
            last = meta.get("last_page")
            if not offers or (isinstance(last, int) and page_n >= last):
                break  # walked the final page
            page_n += 1
            await self._pace(page)
        else:
            # Loop condition failed without a break: the page cap stopped the walk.
            reason = "fetch-cap"

        return ScrapeResult(list(seen.values()), fetches, reason, wire)
|
||||
|
||||
|
||||
# Script entry point: hand the worker class to blworker's run().
if __name__ == "__main__":
    run(SkinLandWorker)
|
||||
Reference in New Issue
Block a user