# Operation-Blue-Laminate-v2/worker/csmoney_worker.py

"""cs.money scrape worker (pull model).

A thin strategy over blworker.Worker: it supplies only the cs.money-specific bits — the
consent banner steps and how to scrape one skin+wear's sell-orders. The warm session, the
poll/scrape/post loop, the IPRoyal proxy and IP rotation, logging and shutdown all live in
the shared runtime. Env knobs are documented in worker/README.md.

cs.money is an Astro SSR app: the free-text market search filters server-side and the
resulting listings are embedded in the page as a __page-params JSON blob. The
/2.0/market/sell-orders API rejects a `search` param (HTTP 400), so we fetch the PAGE for
a search and read the embedded items — same item shape as the API.

A page returns at most 60 and offset is ignored, so we paginate with a FORWARD CURSOR on
float: cs.money honors `order=asc&sort=float` + `minFloat`, and float is full-precision and
effectively unique per item. We grab the 60 lowest-float items at/above `lo`, advance `lo`
to the highest float returned, and repeat until a page is under the cap. (The old
minPrice/maxPrice bisection silently truncated cheap skins: >60 listings can share a
sub-$0.02 reference band, which no price window can split — floats almost never tie, so the
cursor practically always makes progress; if it ever cannot, the job ends with reason
`stuck-float-tie` instead of looping.)

    cd worker
    .venv\\Scripts\\Activate.ps1
    pip install -r requirements.txt
    python csmoney_worker.py
"""

import json
import re
import urllib.parse

from blworker import ScrapeResult, Worker, click, page_fetch, run

# Maximum number of listings a single SSR page embeds.
PAGE_CAP = 60

# Market search URL template: ascending float order, lower bound `lo` rendered with 12
# decimals, upper bound fixed at 1. `search` must already be URL-encoded by the caller.
PAGE = (
    "https://cs.money/market/buy/"
    "?search={search}"
    "&order=asc"
    "&sort=float"
    "&minFloat={lo:.12f}"
    "&maxFloat=1"
)
# Captures the body of the <script id="__page-params"> element embedded in the SSR page.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>', re.S)


def extract_items(html: str) -> list:
    """Pull inventory.items out of the page's __page-params JSON blob.

    Returns the `items` list found under the blob's `inventory` object. Returns an empty
    list — never raises — when the blob is missing, is not valid JSON, or does not have
    the expected shape (top level or `inventory` not an object, `items` not a list).
    """
    m = PAGE_PARAMS_RE.search(html)
    if not m:
        return []
    try:
        params = json.loads(m.group(1))
    except json.JSONDecodeError:
        return []

    # Valid JSON is not necessarily the shape we expect: the blob may decode to null or
    # a list, and "inventory" may be present but null. Walk down defensively instead of
    # chaining .get(), which would raise AttributeError on those payloads.
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    return items if isinstance(items, list) else []


class CsMoneyWorker(Worker):
    """cs.money strategy on top of the shared `Worker` runtime.

    Supplies only the site-specific pieces: the consent-banner steps and the per-job
    scrape of one search's sell-orders.
    """

    name = "csmoney"
    jobs_path = "/jobs"
    default_market_url = "https://cs.money/market/buy/"

    def describe_job(self, job) -> str:
        """Return a short label for `job`, built from its `search` text."""
        return f"search {job['search']!r}"

    async def dismiss_consent(self, page) -> str | None:
        """Privacy-preserving. The banner only offers 'Accept all' / 'Manage cookies';
        the Reject-all control lives inside the Manage window. So: Manage -> Reject all ->
        Confirm. (The data path reads SSR __page-params regardless, but this keeps the
        session honest and unblocks any future interaction.)

        Returns a comma-separated summary of the steps that were actually clicked, or
        None when nothing was clicked.
        """
        steps = []
        if await click(page, "Manage cookies") or await click(page, "Manage"):
            await page.sleep(1)
            if await click(page, "Reject all"):
                steps.append("reject-all")
            # The confirm button's label varies; click the first candidate that exists.
            for c in ("Confirm my choice", "Confirm", "Save"):
                if await click(page, c):
                    steps.append(f"confirm:{c}")
                    break
        return ", ".join(steps) if steps else None

    @staticmethod
    def _floor_float_cursor(value: float) -> float:
        """Return `value` rounded DOWN to the 12 decimals that PAGE puts on the wire.

        `{lo:.12f}` rounds to nearest, so it can render a bound slightly ABOVE the float
        we want to resume from. Because minFloat is inclusive, a rounded-up bound would
        exclude listings at exactly that float which did not fit on the previous page.
        Flooring keeps the bound at or below the true float; anything re-fetched as a
        result is dropped by the id dedup.
        """
        shown = float(f"{value:.12f}")
        if shown > value:
            # Rounded up: step back one unit in the 12th decimal and re-snap to the grid.
            shown = float(f"{shown - 1e-12:.12f}")
        return shown if shown > 0.0 else 0.0

    async def scrape_job(self, page, job) -> ScrapeResult:
        """Scrape ALL listings for one skin+wear via a forward float cursor.

        Grab the 60 lowest-float items at/above `lo`, advance `lo` to the highest float on
        the page, repeat until a page is under the cap. The boundary item is re-fetched
        (minFloat is inclusive) and dropped by the id dedup.

        `job` must carry `search`; an optional `maxPages` caps the page fetches (default
        40, also used when the key is present but null).

        The returned ScrapeResult is built from (listings, fetch count, reason, wire
        bytes), where reason is one of:
          "completed"       - a page came back under the cap
          "challenged"      - the body looked like an anti-bot challenge page
          "stuck-float-tie" - the cursor could not advance
          "fetch-cap"       - the maxPages safety cap was reached
        """
        search = urllib.parse.quote_plus(job["search"])
        max_fetches = job.get("maxPages")  # safety cap on page fetches per job
        if max_fetches is None:
            # Missing key or explicit null: fall back to the default instead of letting
            # `fetches < None` raise TypeError below.
            max_fetches = 40
        seen: dict = {}
        fetches = 0
        wire = 0
        lo = 0.0
        reason = "completed"

        while fetches < max_fetches:
            # Send a bound that is never above `lo`, so listings at exactly `lo` are
            # still included by the inclusive minFloat filter.
            url = PAGE.format(search=search, lo=self._floor_float_cursor(lo))
            _status, body, wbytes = await page_fetch(page, url)
            fetches += 1
            # Only positive byte counts are accumulated.
            # NOTE(review): presumably a non-positive value means "size unknown" —
            # confirm against blworker.page_fetch.
            if wbytes > 0:
                wire += wbytes

            if "Just a moment" in body or "challenge-platform" in body:
                return ScrapeResult(list(seen.values()), fetches, "challenged", wire)

            items = extract_items(body)
            floats = []
            for it in items:
                if not isinstance(it, dict):
                    continue  # malformed entry: nothing to key on, don't crash the job
                if it.get("id") is not None:
                    seen[it["id"]] = it
                fl = (it.get("asset") or {}).get("float")
                if isinstance(fl, (int, float)):
                    floats.append(fl)

            if len(items) < PAGE_CAP:
                break  # last page — fewer than the cap means we've seen everything

            # Advance the cursor to the highest float on this page. Items at exactly that
            # float are re-fetched next round (minFloat is inclusive) and deduped by id.
            nxt = max(floats) if floats else None
            if nxt is None or nxt <= lo:
                # Cursor can't advance: >60 listings share a single float value, or the
                # items carry no float. Bail loudly rather than spin — a flagged gap beats
                # a silent one (this is the failure the price-window version hid).
                reason = "stuck-float-tie"
                break
            lo = nxt

            # NOTE(review): _pace comes from the shared runtime — presumably the
            # inter-request delay; confirm in blworker.Worker.
            await self._pace(page)
        else:
            # The while condition went false without a break: the cap was exhausted.
            reason = "fetch-cap"

        return ScrapeResult(list(seen.values()), fetches, reason, wire)


# Script entry point: hand the worker class to the shared blworker `run` helper.
if __name__ == "__main__":
    run(CsMoneyWorker)