130 lines
5.3 KiB
Python
130 lines
5.3 KiB
Python
"""cs.money scrape worker (pull model).

A thin strategy over blworker.Worker: it supplies only the cs.money-specific bits — the
consent banner steps and how to scrape one skin+wear's sell-orders. The warm session, the
poll/scrape/post loop, the IPRoyal proxy and IP rotation, logging and shutdown all live in
the shared runtime. Env knobs are documented in worker/README.md.

cs.money is an Astro SSR app: the free-text market search filters server-side and the
resulting listings are embedded in the page as a __page-params JSON blob. The
/2.0/market/sell-orders API rejects a `search` param (HTTP 400), so we fetch the PAGE for
a search and read the embedded items — same item shape as the API.

A page returns at most 60 items and `offset` is ignored, so we paginate with a FORWARD
CURSOR on float: cs.money honors `order=asc&sort=float` + `minFloat`, and float is
full-precision and effectively unique per item. We grab the 60 lowest-float items at/above
`lo`, advance `lo` to the highest float returned, and repeat until a page is under the cap.
(The old minPrice/maxPrice bisection silently truncated cheap skins: >60 listings can share
a sub-$0.02 reference band, which no price window can split — floats almost never tie, so
the cursor always makes progress.)

    cd worker
    .venv\\Scripts\\Activate.ps1
    pip install -r requirements.txt
    python csmoney_worker.py
"""
|
|
|
|
import json
|
|
import re
|
|
import urllib.parse
|
|
|
|
from blworker import ScrapeResult, Worker, click, page_fetch, run
|
|
|
|
# Search-page URL template. `search` must already be URL-encoded; `lo` is the
# float cursor and is rendered with 12 decimal places as the `minFloat` bound.
PAGE = (
    "https://cs.money/market/buy/?search={search}"
    "&order=asc&sort=float&minFloat={lo:.12f}&maxFloat=1"
)

# Number of items one SSR page embeds at most.
PAGE_CAP = 60

# Captures the text inside the <script id="__page-params"> element. DOTALL is
# needed so the capture group can span a JSON blob containing newlines.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>',
    re.DOTALL,
)
|
|
|
|
|
|
def extract_items(html: str) -> list:
    """Pull inventory.items out of the page's __page-params JSON blob.

    Returns the list stored at ``inventory.items`` of the embedded JSON, or an
    empty list when the blob is missing, is not valid JSON, or does not have
    the expected object -> object -> list shape. Never raises for a page whose
    blob is merely shaped differently (e.g. ``"inventory": null``).
    """
    match = PAGE_PARAMS_RE.search(html)
    if match is None:
        return []

    try:
        params = json.loads(match.group(1))
    except json.JSONDecodeError:
        return []

    # json.loads can yield any JSON type (list, str, None, ...), so check each
    # level before calling .get() on it instead of assuming nested objects.
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    return items if isinstance(items, list) else []
|
|
|
|
|
|
class CsMoneyWorker(Worker):
    """cs.money strategy for the shared blworker runtime.

    Supplies the job description, the consent-banner steps and the per-job
    scrape; everything else is inherited from ``Worker``.
    """

    name = "csmoney"
    jobs_path = "/jobs"
    default_market_url = "https://cs.money/market/buy/"

    def describe_job(self, job) -> str:
        """Return a short human-readable label for ``job`` (its search text)."""
        return f"search {job['search']!r}"

    async def dismiss_consent(self, page) -> str | None:
        """Privacy-preserving. The banner only offers 'Accept all' / 'Manage cookies';
        the Reject-all control lives inside the Manage window. So: Manage -> Reject all ->
        Confirm. (The data path reads SSR __page-params regardless, but this keeps the
        session honest and unblocks any future interaction.)

        Returns a comma-separated summary of the steps that were clicked, or
        None when nothing was clicked.
        """
        steps = []
        # Reject/confirm only make sense once the Manage window has been opened.
        if await click(page, "Manage cookies") or await click(page, "Manage"):
            await page.sleep(1)
            if await click(page, "Reject all"):
                steps.append("reject-all")
            # The confirm button's label varies; take the first one that clicks.
            for label in ("Confirm my choice", "Confirm", "Save"):
                if await click(page, label):
                    steps.append(f"confirm:{label}")
                    break
        return ", ".join(steps) if steps else None

    @staticmethod
    def _floor_cursor(lo: float) -> float:
        """Round ``lo`` DOWN to the 12 decimals the PAGE template can carry.

        PAGE renders the cursor with ``.12f``, which rounds to nearest. When
        that rounds up, ``minFloat`` ends up above the real boundary float, so
        the boundary item is not re-fetched and any unseen listing tied at that
        float is skipped. Flooring keeps the requested bound at or below ``lo``.
        """
        floored = int(lo * 1e12) / 1e12
        if floored > lo:
            # Floating-point multiply nudged us across a 1e-12 step; step back.
            floored -= 1e-12
        return floored

    async def scrape_job(self, page, job) -> ScrapeResult:
        """Scrape ALL listings for one skin+wear via a forward float cursor.

        Grab the 60 lowest-float items at/above `lo`, advance `lo` to the highest float on
        the page, repeat until a page is under the cap. The boundary item is re-fetched
        (minFloat is inclusive) and dropped by the id dedup.

        ``job`` must carry ``"search"``; ``"maxPages"`` optionally overrides the
        fetch cap (default 40). The result's reason is one of ``"completed"``,
        ``"challenged"``, ``"stuck-float-tie"`` or ``"fetch-cap"``.
        """
        search = urllib.parse.quote_plus(job["search"])
        max_fetches = job.get("maxPages", 40)  # safety cap on page fetches per job
        seen: dict = {}  # item id -> item; dedups the re-fetched boundary items
        fetches = 0
        wire = 0
        lo = 0.0
        reason = "completed"

        while fetches < max_fetches:
            # Send the cursor rounded down so the inclusive bound never
            # overshoots the true boundary float.
            url = PAGE.format(search=search, lo=self._floor_cursor(lo))
            _status, body, wbytes = await page_fetch(page, url)
            fetches += 1
            # Only positive byte counts are accumulated; presumably a
            # non-positive value means "unknown" — confirm against page_fetch.
            if wbytes > 0:
                wire += wbytes

            # Anti-bot interstitial instead of the market page: hand back what
            # we have and flag it.
            if "Just a moment" in body or "challenge-platform" in body:
                return ScrapeResult(list(seen.values()), fetches, "challenged", wire)

            items = extract_items(body)
            floats = []
            for it in items:
                if it.get("id") is not None:
                    seen[it["id"]] = it
                fl = (it.get("asset") or {}).get("float")
                if isinstance(fl, (int, float)):
                    floats.append(fl)

            if len(items) < PAGE_CAP:
                break  # last page — fewer than the cap means we've seen everything

            # Advance the cursor past the highest float on this page. Items at exactly that
            # float are re-fetched next round (minFloat is inclusive) and deduped by id.
            nxt = max(floats) if floats else None
            if nxt is None or nxt <= lo:
                # Cursor can't advance: >60 listings share a single float value, or the
                # items carry no float. Bail loudly rather than spin — a flagged gap beats
                # a silent one (this is the failure the price-window version hid).
                reason = "stuck-float-tie"
                break
            lo = nxt

            await self._pace(page)
        else:
            # Loop ran out of fetch budget without hitting a short page.
            reason = "fetch-cap"

        return ScrapeResult(list(seen.values()), fetches, reason, wire)
|
|
|
|
|
|
# Script entry point: hand the worker class to blworker's run() when this file
# is executed directly (not when it is imported).
if __name__ == "__main__":
    run(CsMoneyWorker)
|