almost ready
This commit is contained in:
129
worker/csmoney_worker.py
Normal file
129
worker/csmoney_worker.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""cs.money scrape worker (pull model).
|
||||
|
||||
A thin strategy over blworker.Worker: it supplies only the cs.money-specific bits — the
consent banner steps and how to scrape one skin+wear's sell-orders. The warm session, the
poll/scrape/post loop, the IPRoyal proxy and IP rotation, logging and shutdown all live in
the shared runtime. Env knobs are documented in worker/README.md.

cs.money is an Astro SSR app: the free-text market search filters server-side and the
resulting listings are embedded in the page as a __page-params JSON blob. The
/2.0/market/sell-orders API rejects a `search` param (HTTP 400), so we fetch the PAGE for
a search and read the embedded items — same item shape as the API.

A page returns at most 60 and offset is ignored, so we paginate with a FORWARD CURSOR on
float: cs.money honors `order=asc&sort=float` + `minFloat`, and float is full-precision and
effectively unique per item. We grab the 60 lowest-float items at/above `lo`, advance `lo`
to the highest float returned, and repeat until a page is under the cap. (The old
minPrice/maxPrice bisection silently truncated cheap skins: >60 listings can share a
sub-$0.02 reference band, which no price window can split — floats almost never tie, so the
cursor always makes progress.)

    cd worker
    .venv\\Scripts\\Activate.ps1
    pip install -r requirements.txt
    python csmoney_worker.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import urllib.parse
|
||||
|
||||
from blworker import ScrapeResult, Worker, click, page_fetch, run
|
||||
|
||||
# Search-page URL template. `search` is the URL-encoded query text and `lo` is the lower
# float bound of the cursor, rendered with 12 decimals; results are sorted by float ascending.
PAGE = (
    "https://cs.money/market/buy/?search={search}"
    "&order=asc&sort=float&minFloat={lo:.12f}&maxFloat=1"
)

# Number of items one SSR page carries; a shorter page is treated as the last one.
PAGE_CAP = 60
|
||||
# Captures the text content of the <script id="__page-params"> element embedded in the page.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>', re.S)


def extract_items(html: str) -> list:
    """Pull inventory.items out of the page's __page-params JSON blob.

    Args:
        html: Full page source to search.

    Returns:
        The list stored at ``inventory.items`` in the blob. An empty list is returned
        when the script tag is missing, its content is not valid JSON, or the payload
        does not have the expected object -> ``inventory`` object -> ``items`` list shape
        (including ``null`` at any level).
    """
    m = PAGE_PARAMS_RE.search(html)
    if not m:
        return []
    try:
        params = json.loads(m.group(1))
    except json.JSONDecodeError:
        return []
    # Validate each level explicitly: a `null` inventory or a non-object payload must
    # yield "no items", not an AttributeError from chaining .get() on None / a list.
    if not isinstance(params, dict):
        return []
    inventory = params.get("inventory")
    if not isinstance(inventory, dict):
        return []
    items = inventory.get("items")
    return items if isinstance(items, list) else []
|
||||
|
||||
|
||||
class CsMoneyWorker(Worker):
    """cs.money strategy on top of the shared ``Worker`` runtime.

    Provides the job label, the consent-banner steps and the per-job scrape; everything
    else is inherited from ``Worker``.
    """

    name = "csmoney"
    jobs_path = "/jobs"
    default_market_url = "https://cs.money/market/buy/"

    def describe_job(self, job) -> str:
        """Return a short human-readable label for ``job`` built from its ``search`` text."""
        return f"search {job['search']!r}"

    async def dismiss_consent(self, page) -> str | None:
        """Privacy-preserving. The banner only offers 'Accept all' / 'Manage cookies';
        the Reject-all control lives inside the Manage window. So: Manage -> Reject all ->
        Confirm. (The data path reads SSR __page-params regardless, but this keeps the
        session honest and unblocks any future interaction.)

        Returns:
            A comma-separated summary of the steps that were clicked, or ``None`` when
            nothing was clicked.
        """
        steps = []
        if await click(page, "Manage cookies") or await click(page, "Manage"):
            await page.sleep(1)  # give the Manage window a moment to open
            if await click(page, "Reject all"):
                steps.append("reject-all")
            # Try the confirm button labels in order; stop at the first that clicks.
            for c in ("Confirm my choice", "Confirm", "Save"):
                if await click(page, c):
                    steps.append(f"confirm:{c}")
                    break
        return ", ".join(steps) if steps else None

    @staticmethod
    def _floor_cursor(value) -> float:
        """Round ``value`` DOWN to the 12 decimals the page URL carries.

        The URL renders the cursor with ``.12f``, which rounds to nearest. Rounding up
        would put ``minFloat`` above the float we just saw, so the boundary item (and any
        other listing sharing that exact float) would be skipped instead of re-fetched.
        Flooring guarantees the rendered bound never exceeds ``value``.
        """
        from decimal import ROUND_FLOOR, Decimal

        floored = Decimal(value).quantize(Decimal("0.000000000001"), rounding=ROUND_FLOOR)
        return float(floored)

    async def scrape_job(self, page, job) -> ScrapeResult:
        """Scrape ALL listings for one skin+wear via a forward float cursor.

        Grab the 60 lowest-float items at/above `lo`, advance `lo` to the highest float on
        the page (floored to URL precision), repeat until a page is under the cap. The
        boundary item is re-fetched (minFloat is inclusive) and dropped by the id dedup.

        Args:
            page: Browser page handle passed through to ``page_fetch``.
            job: Mapping with the ``search`` text and an optional ``maxPages`` fetch cap
                (default 40).

        Returns:
            A ``ScrapeResult`` built from the deduplicated items, the number of fetches,
            a reason string and the accumulated wire bytes. Reasons produced here:
            ``"completed"``, ``"challenged"``, ``"stuck-float-tie"``, ``"fetch-cap"`` and
            ``"http-<status>"``.
        """
        search = urllib.parse.quote_plus(job["search"])
        max_fetches = job.get("maxPages", 40)  # safety cap on page fetches per job
        seen: dict = {}
        fetches = 0
        wire = 0
        lo = 0.0
        reason = "completed"

        while fetches < max_fetches:
            status, body, wbytes = await page_fetch(page, PAGE.format(search=search, lo=lo))
            fetches += 1
            if wbytes > 0:
                wire += wbytes

            if "Just a moment" in body or "challenge-platform" in body:
                return ScrapeResult(list(seen.values()), fetches, "challenged", wire)

            items = extract_items(body)

            # An HTTP error page with no listings is a failed fetch, not an empty last
            # page: flag it instead of letting the short-page check report "completed".
            # NOTE(review): assumes page_fetch returns the HTTP status code as an int —
            # confirm against blworker.
            if not items and isinstance(status, int) and status >= 400:
                reason = f"http-{status}"
                break

            floats = []
            for it in items:
                if it.get("id") is not None:
                    seen[it["id"]] = it
                fl = (it.get("asset") or {}).get("float")
                if isinstance(fl, (int, float)):
                    floats.append(fl)

            if len(items) < PAGE_CAP:
                break  # last page — fewer than the cap means we've seen everything

            # Advance the cursor to the highest float on this page, floored to the 12
            # decimals the URL can express. Items at exactly that float are re-fetched
            # next round (minFloat is inclusive) and deduped by id.
            nxt = self._floor_cursor(max(floats)) if floats else None
            if nxt is None or nxt <= lo:
                # Cursor can't advance: >60 listings share a single float value (at URL
                # precision), or the items carry no float. Bail loudly rather than spin —
                # a flagged gap beats a silent one (this is the failure the price-window
                # version hid).
                reason = "stuck-float-tie"
                break
            lo = nxt

            await self._pace(page)
        else:
            # Loop condition failed without a break: the per-job fetch budget ran out.
            reason = "fetch-cap"

        return ScrapeResult(list(seen.values()), fetches, reason, wire)
|
||||
|
||||
|
||||
# Script entry point: hand the worker class to the shared runtime's `run`.
if __name__ == "__main__":
    run(CsMoneyWorker)
|
||||
Reference in New Issue
Block a user