Add cs.money worker stack with per-worker IPRoyal residential proxy

Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 15:03:31 -05:00
parent eb5fb0dac7
commit dc7c3f99ae
82 changed files with 8354 additions and 571 deletions
--- a/worker/poc.py
+++ b/worker/poc.py
@@ -0,0 +1,285 @@
+"""
+Proof-of-concept / pre-fleet validation for the cs.money scraper.
+
+Proves the things we need before building the C2 + worker fleet:
+  1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't.
+  2. a single WARM session can page the sell-orders API deeply without re-challenge.
+  3. a free-text market search (e.g. "cyber security ft") can be turned into a
+     filtered sell-orders API call — we DISCOVER the real API params by capturing the
+     request the page itself fires, instead of guessing.
+
+It opens the market (optionally a search URL) in a real non-headless Chromium, lets
+you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the
+sell-orders request the page makes, then pages that API from inside the cleared page
+(same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge.
+
+    cd worker
+    .venv\\Scripts\\Activate.ps1
+    pip install -r requirements.txt
+
+    python poc.py                       # whole-market sweep
+    $env:SEARCH="cyber security ft"; python poc.py   # targeted: FT M4A4 Cyber Security
+
+Env knobs (all optional):
+    SEARCH         free-text market search; when set, scrape only those results
+    MARKET_URL     market page base (default the buy market)
+    SOLVE_SECONDS  seconds to wait for you to clear Cloudflare (default 30)
+    PAGES          how many offset pages (60 each) to attempt (default 20)
+    START_OFFSET   first offset (default 0)
+    DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5)
+    PROXY          host:port for an auth-free proxy (omit to use your own IP)
+    BROWSER_PATH   path to Chrome/Edge if auto-detect fails
+"""
+
+import json
+import os
+import pathlib
+import random
+from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit
+
+import nodriver as uc
+from nodriver import cdp
+
+# --- Optional environment knobs (documented in the module docstring) ---------
+# Unset string knobs stay None; numeric knobs are parsed once at import time.
+SEARCH = os.getenv("SEARCH")
+MARKET_URL = os.getenv("MARKET_URL", "https://cs.money/market/buy/")
+SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))
+PAGES = int(os.getenv("PAGES", "20"))
+START_OFFSET = int(os.getenv("START_OFFSET", "0"))
+DELAY = float(os.getenv("DELAY", "2.0"))
+JITTER = float(os.getenv("JITTER", "1.5"))
+PROXY = os.getenv("PROXY")
+BROWSER_PATH = os.getenv("BROWSER_PATH")
+
+# Used only when the page's own sell-orders request could not be captured;
+# the '{}' placeholder receives the offset.
+DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}"
+
+# Capture files are written next to this script.
+OUT_DIR = pathlib.Path(__file__).parent.joinpath("captures")
+
+# Button texts tried in this order when dismissing the cookie banner.
+# Deliberately contains no "accept" variant.
+CONSENT_LABELS = [
+    "Reject all",
+    "Reject All",
+    "Only necessary",
+    "Necessary only",
+    "Reject",
+    "Decline",
+    "Deny",
+]
+
+# Sell-orders request URLs observed from the page; appended to by the CDP
+# network handler and by the Resource Timing lookup.
+_seen_urls: list[str] = []
+
+
+def looks_like_challenge(body: str) -> bool:
+    """Return True when *body* does not look like a JSON API response.
+
+    An empty/None body, a body whose first non-whitespace character is '<'
+    (i.e. markup), or one containing a known Cloudflare interstitial marker
+    is treated as a challenge page.
+    """
+    stripped = (body or "").lstrip()
+    if not stripped:
+        return True
+    if stripped[0] == "<":
+        return True
+    return any(marker in body for marker in ("Just a moment", "challenge-platform"))
+
+
+def decimals(v: float) -> int:
+    """Return how many digits follow the decimal point in the shortest repr of *v*.
+
+    The count is derived from the decimal exponent of ``repr(float(v))`` rather
+    than from the characters after '.', so values that repr() prints in
+    scientific notation are counted correctly (``1.234e-05`` -> 8, not 7;
+    ``1e-05`` -> 5, not 0). A whole float such as ``1.0`` still yields 1, as
+    its repr is ``'1.0'``. Infinities and NaN yield 0.
+    """
+    from decimal import Decimal
+
+    exponent = Decimal(repr(float(v))).as_tuple().exponent
+    if not isinstance(exponent, int):
+        # inf / nan report a letter ('F', 'n', 'N') instead of a numeric exponent.
+        return 0
+    # A positive exponent (e.g. 1e+22) means there are no fractional digits.
+    return max(0, -exponent)
+
+
+def template_from(url: str) -> str:
+    """Build a pagination template from a captured sell-orders URL.
+
+    Every query parameter except ``offset`` is kept (blank values included),
+    ``limit=60`` is appended when the URL carries no ``limit``, and the offset
+    is re-added last as a ``{}`` placeholder. Any URL fragment is dropped.
+    """
+    split = urlsplit(url)
+
+    kept = []
+    has_limit = False
+    for key, value in parse_qsl(split.query, keep_blank_values=True):
+        if key == "offset":
+            continue
+        if key == "limit":
+            has_limit = True
+        kept.append((key, value))
+    if not has_limit:
+        kept.append(("limit", "60"))
+
+    encoded = urlencode(kept)
+    query = f"{encoded}&offset={{}}" if encoded else "offset={}"
+    return urlunsplit((split.scheme, split.netloc, split.path, query, ""))
+
+
+async def dismiss_consent(page) -> str | None:
+    """Try to close the cookie banner without ever choosing 'Accept all'.
+
+    Each text in CONSENT_LABELS is looked up on *page* in order; the first
+    element that is found and clicks without raising wins, and its label is
+    returned. Lookup and click failures are ignored (best effort). Returns
+    None when nothing could be clicked.
+    """
+    for label in CONSENT_LABELS:
+        try:
+            button = await page.find(label, best_match=True, timeout=2)
+        except Exception:
+            continue  # lookup failed for this label; try the next one
+        if not button:
+            continue
+        try:
+            await button.click()
+        except Exception:
+            continue  # found but not clickable; try the next label
+        return label
+    return None
+
+
+async def fetch_json(page, url: str) -> tuple[str, str]:
+    """Fetch *url* from inside *page* and return ``(status, body)`` as strings.
+
+    The request is made by evaluating a JavaScript ``fetch`` in the page with
+    ``credentials:'include'``; the page serialises ``{status, body}`` with
+    JSON.stringify so the result crosses back as a plain string.
+
+    Returns ``("-1", "")`` when the evaluation does not yield a string, and
+    ``("-1", raw)`` when the returned string is not the expected JSON object.
+    """
+    # json.dumps yields a valid JavaScript string literal; Python's repr() does
+    # not (its quoting and escapes such as \U0001F600 follow Python's grammar).
+    js_url = json.dumps(url)
+    expr = (
+        f"fetch({js_url}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
+        ".then(async r => JSON.stringify({status: r.status, body: await r.text()}))"
+    )
+    raw = await page.evaluate(expr, await_promise=True)
+    if not isinstance(raw, str):
+        return ("-1", "")
+    try:
+        obj = json.loads(raw)
+    except json.JSONDecodeError:
+        return ("-1", raw)
+    if not isinstance(obj, dict):
+        # Valid JSON but not the {status, body} envelope we asked the page for.
+        return ("-1", raw)
+    body = obj.get("body", "")
+    return (str(obj.get("status", "-1")), body if isinstance(body, str) else "")
+
+
+async def main():
+    """Run the proof of concept end to end.
+
+    Launches a non-headless Chromium via nodriver (through PROXY when set),
+    opens MARKET_URL (with ``search=`` appended when SEARCH is set), waits
+    SOLVE_SECONDS, tries to dismiss the consent banner, discovers the
+    sell-orders URL the page requested, then pages that API with in-page
+    fetches. Each clean page is written to OUT_DIR and a summary is printed.
+    The browser is stopped in ``finally`` once it has been started.
+    """
+    OUT_DIR.mkdir(exist_ok=True)
+    args = [f"--proxy-server={PROXY}"] if PROXY else []
+
+    target_url = MARKET_URL
+    tag = "market"  # filename prefix for the per-page capture files
+    if SEARCH:
+        sep = "&" if "?" in MARKET_URL else "?"
+        target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}"
+        # Non-alphanumerics become '_'; the 40-char cap applies to the sanitised
+        # search text only, not to the "search_" prefix.
+        tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40]
+
+    print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...")
+    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)
+
+    pages_ok = items_total = floats_total = low_prec = 0
+    dp_min, dp_max = 99, 0  # running min/max decimal counts; only dp_max is reported below
+    deepest_offset = None
+    reason = "completed (hit PAGES limit)"  # overwritten if the paging loop breaks early
+
+    try:
+        # Open a blank tab first so the network handler is attached BEFORE the page
+        # fires its filtered sell-orders request (otherwise we'd miss it).
+        page = await browser.get("about:blank")
+
+        async def on_request(evt):
+            # Record every sell-orders request URL the page issues.
+            url = evt.request.url
+            if "/market/sell-orders" in url:
+                _seen_urls.append(url)
+
+        page.add_handler(cdp.network.RequestWillBeSent, on_request)
+        try:
+            await page.send(cdp.network.enable())
+        except Exception as ex:
+            # Not fatal: the Resource Timing lookup below is a second discovery path.
+            print(f"(network capture unavailable: {ex})")
+
+        print(f"Opening {target_url}")
+        await page.get(target_url)
+        print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...")
+        await page.sleep(SOLVE_SECONDS)
+
+        clicked = await dismiss_consent(page)
+        print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}")
+
+        # cs.money is an Astro SSR app — the initial filtered listings are rendered
+        # server-side (no client XHR to capture). Scroll to provoke lazy-load
+        # pagination, which DOES fire a client request carrying the real filter params.
+        print("Scrolling to trigger lazy-load pagination...")
+        for _ in range(6):
+            try:
+                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+            except Exception:
+                pass  # best effort: a failed scroll just means one fewer lazy-load trigger
+            await page.sleep(2)
+
+        # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS
+        # and json.loads here (the string path is proven by fetch_json).
+        async def js_list(expr: str) -> list:
+            raw = await page.evaluate(f"JSON.stringify({expr})")
+            try:
+                return json.loads(raw) if isinstance(raw, str) else []
+            except (json.JSONDecodeError, TypeError):
+                return []
+
+        # Discovery via the Resource Timing API: read the URLs of the requests the
+        # page made straight out of the browser's performance entries, so we do not
+        # depend on CDP event timing. Also dump nearby API calls for context.
+        # NOTE(review): browsers cap the resource-timing buffer, so a very busy page
+        # may not list every request — confirm if sell-orders URLs go missing.
+        try:
+            all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)")
+            print(f">>> Resource Timing saw {len(all_urls)} requests total")
+            if all_urls:
+                # dict.fromkeys de-duplicates while keeping first-seen order.
+                (OUT_DIR / "_all_requests.txt").write_text(
+                    "\n".join(dict.fromkeys(all_urls)), encoding="utf-8")
+            sell = [u for u in all_urls if "/market/sell-orders" in u]
+            _seen_urls.extend(sell)
+            api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)]
+            if api:
+                (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8")
+                print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}")
+        except Exception as ex:
+            print(f"(resource-timing query failed: {ex})")
+
+        # Dump the SSR'd page so we can see how the filter is encoded and where the
+        # listings data lives (Astro embeds island props / hydration JSON in the HTML).
+        try:
+            html = await page.evaluate("document.documentElement.outerHTML")
+            if isinstance(html, str) and html:
+                (OUT_DIR / "_page.html").write_text(html, encoding="utf-8")
+                print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}")
+        except Exception as ex:
+            print(f"(page HTML dump failed: {ex})")
+
+        # Discovery: what sell-orders request did the page actually make?
+        if _seen_urls:
+            # Use the most recently recorded URL as the basis for pagination.
+            captured = _seen_urls[-1]
+            template = template_from(captured)
+            print("\n>>> DISCOVERED sell-orders API call the page fired:")
+            print(f"    {captured}")
+            print(f">>> pagination template: {template}\n")
+            # Persist it — the console line is easy to lose, and this is the one bit
+            # of ground truth (the real filter-param scheme) we need.
+            (OUT_DIR / "_discovered.txt").write_text(
+                "ALL captured sell-orders requests:\n"
+                + "\n".join(dict.fromkeys(_seen_urls))
+                + f"\n\npagination template:\n{template}\n",
+                encoding="utf-8")
+            print(f">>> saved to {OUT_DIR / '_discovered.txt'}")
+        else:
+            # Nothing captured: guess the API shape, passing SEARCH as a 'search' param.
+            template = DEFAULT_TEMPLATE
+            if SEARCH:
+                template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}")
+            print(f"\n(no request captured; falling back to template: {template})\n")
+
+        for i in range(PAGES):
+            # Offsets advance by a fixed 60 per page regardless of the template's limit.
+            offset = START_OFFSET + i * 60
+            status, body = await fetch_json(page, template.format(offset))
+
+            if looks_like_challenge(body):
+                print(f"  page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.")
+                # Keep the challenge body for inspection.
+                (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8")
+                reason = f"re-challenged at offset {offset}"
+                break
+
+            try:
+                items = json.loads(body).get("items", [])
+            except json.JSONDecodeError:
+                print(f"  page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.")
+                reason = f"non-JSON at offset {offset}"
+                break
+
+            if not items:
+                print(f"  page {i + 1} [offset {offset}]: 0 items — end of results.")
+                reason = "end of results"
+                break
+
+            # Save the raw response body exactly as received.
+            (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8")
+            pages_ok += 1
+            deepest_offset = offset
+            items_total += len(items)
+            names = set()
+            for it in items:
+                fl = it.get("asset", {}).get("float")
+                if fl is not None:
+                    floats_total += 1
+                    d = decimals(fl)
+                    dp_min, dp_max = min(dp_min, d), max(dp_max, d)
+                    if d <= 6:  # short repr — exact binary fraction (e.g. 1/16), not truncation
+                        low_prec += 1
+                # May add None when an item carries no full name.
+                names.add(it.get("asset", {}).get("names", {}).get("full"))
+            sample = next(iter(names), None) if SEARCH else None
+            print(f"  page {i + 1} [offset {offset}] OK — {len(items)} items"
+                  + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else ""))
+
+            # Pace requests: fixed DELAY plus up to JITTER seconds of random extra wait.
+            await page.sleep(DELAY + random.uniform(0, JITTER))
+
+        print("\n=== summary ===")
+        print(f"  query: {SEARCH or '(whole market)'}")
+        print(f"  stopped: {reason}")
+        print(f"  clean pages: {pages_ok}  deepest offset: {deepest_offset}  items: {items_total}")
+        if floats_total:
+            # Truncation would make MANY values short, not one exact binary fraction.
+            verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION"
+            print(f"  floats: {floats_total} items, {dp_max}-decimal max, "
+                  f"{low_prec} short-repr (exact fractions) — {verdict}")
+        print(f"  files in {OUT_DIR}")
+    finally:
+        # Always shut the browser down, even if discovery or paging raised.
+        browser.stop()
+
+
+# Script entry point: run main() to completion on the event loop that
+# nodriver's uc.loop() returns.
+if __name__ == "__main__":
+    uc.loop().run_until_complete(main())