# Operation-Blue-Laminate-v2/worker/poc.py

"""
Proof-of-concept / pre-fleet validation for the cs.money scraper.

Proves the things we need before building the C2 + worker fleet:
  1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't.
  2. a single WARM session can page the sell-orders API deeply without re-challenge.
  3. a free-text market search (e.g. "cyber security ft") can be turned into a
     filtered sell-orders API call — we DISCOVER the real API params by capturing the
     request the page itself fires, instead of guessing.

It opens the market (optionally a search URL) in a real non-headless Chromium, lets
you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the
sell-orders request the page makes, then pages that API from inside the cleared page
(same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge.

    cd worker
    .venv\\Scripts\\Activate.ps1
    pip install -r requirements.txt

    python poc.py                       # whole-market sweep
    $env:SEARCH="cyber security ft"; python poc.py   # targeted: FT M4A4 Cyber Security

Env knobs (all optional):
    SEARCH         free-text market search; when set, scrape only those results
    MARKET_URL     market page base (default the buy market)
    SOLVE_SECONDS  seconds to wait for you to clear Cloudflare (default 30)
    PAGES          how many offset pages (60 each) to attempt (default 20)
    START_OFFSET   first offset (default 0)
    DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5)
    PROXY          host:port for an auth-free proxy (omit to use your own IP)
    BROWSER_PATH   path to Chrome/Edge if auto-detect fails
"""

import json
import os
import pathlib
import random
from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit

import nodriver as uc
from nodriver import cdp

# Runtime configuration, read once from the environment at import time. The
# int()/float() conversions raise ValueError on a non-numeric value, so a
# mistyped knob fails before the browser is launched.
SEARCH = os.environ.get("SEARCH")  # None when unset -> whole-market sweep
MARKET_URL = os.environ.get("MARKET_URL", "https://cs.money/market/buy/")
SOLVE_SECONDS = int(os.environ.get("SOLVE_SECONDS", "30"))
PAGES = int(os.environ.get("PAGES", "20"))
START_OFFSET = int(os.environ.get("START_OFFSET", "0"))
DELAY = float(os.environ.get("DELAY", "2.0"))
JITTER = float(os.environ.get("JITTER", "1.5"))
PROXY = os.environ.get("PROXY")  # None when unset -> no --proxy-server argument
BROWSER_PATH = os.environ.get("BROWSER_PATH")  # None when unset

# Fallback template if we fail to capture the page's own request (offset = {}).
# The single '{}' placeholder is filled in with str.format(offset) when paging.
DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}"
# Capture directory that sits next to this script; main() creates it on start.
OUT_DIR = pathlib.Path(__file__).parent / "captures"
# Button texts dismiss_consent() tries, in this order. Every entry is a
# reject/decline style label; no "accept" label is listed.
CONSENT_LABELS = ["Reject all", "Reject All", "Only necessary", "Necessary only",
                  "Reject", "Decline", "Deny"]

# Filled by the CDP network handler with sell-orders request URLs the page fires.
# main() also appends the matching URLs it reads from the Resource Timing API.
_seen_urls: list[str] = []


def looks_like_challenge(body: str) -> bool:
    """Return True when *body* does not look like a JSON API response.

    A response counts as a challenge when it is missing or whitespace-only,
    when its first non-blank character is ``<`` (markup rather than JSON), or
    when it contains one of the known challenge-page marker strings.
    """
    stripped = (body or "").lstrip()
    if not stripped:
        # None, empty and whitespace-only bodies all end up here.
        return True
    if stripped.startswith("<"):
        return True
    return any(marker in body for marker in ("Just a moment", "challenge-platform"))


def decimals(v: float) -> int:
    """Return how many digits follow the decimal point of ``float(v)``.

    The count is taken from the shortest round-trip ``repr`` of the float, so
    ``0.0625`` gives 4 and ``1.0`` gives 1 (the trailing zero that ``repr``
    prints is counted).

    ``repr`` switches to scientific notation for very small and very large
    magnitudes (``1.234e-05``). The exponent is folded back in so such values
    report their true number of decimal places (8 for ``1.234e-05``) rather
    than the length of the text that happens to follow the ``.``.

    Non-finite values (``inf``, ``nan``) and values with no fractional digits
    return 0.
    """
    text = repr(float(v))
    mantissa, _, exponent = text.partition("e")
    fraction = mantissa.partition(".")[2]
    # A negative exponent pushes digits further to the right of the point, a
    # positive one pulls them to the left; int() accepts the "+16" / "-05" forms.
    shift = int(exponent) if exponent else 0
    return max(len(fraction) - shift, 0)


def template_from(url: str) -> str:
    """Build a pagination template from a captured sell-orders URL.

    Every query parameter except ``offset`` is kept, in its original order and
    including blank values, so the page's own search/filter encoding survives.
    ``limit=60`` is appended when the URL carries no ``limit``. The result ends
    in ``offset={}`` so callers can fill the offset in with ``str.format``; any
    URL fragment is dropped.
    """
    pieces = urlsplit(url)

    kept = []
    has_limit = False
    for key, value in parse_qsl(pieces.query, keep_blank_values=True):
        if key == "offset":
            continue
        if key == "limit":
            has_limit = True
        kept.append((key, value))
    if not has_limit:
        kept.append(("limit", "60"))

    encoded = urlencode(kept)
    query = f"{encoded}&offset={{}}" if encoded else "offset={}"
    return urlunsplit((pieces.scheme, pieces.netloc, pieces.path, query, ""))


async def dismiss_consent(page) -> str | None:
    """Try to close the cookie banner using only the labels in CONSENT_LABELS.

    Labels are tried in list order. A lookup that raises or finds nothing, and
    a click that raises, both just move on to the next label, so this is purely
    best-effort and never propagates an error.

    Returns the label whose element was clicked successfully, or None when no
    label could be clicked.
    """
    for text in CONSENT_LABELS:
        try:
            button = await page.find(text, best_match=True, timeout=2)
        except Exception:
            # Lookup failed for this label; fall through to the next one.
            continue
        if not button:
            continue
        try:
            await button.click()
        except Exception:
            # Element was found but could not be clicked; keep trying.
            continue
        return text
    return None


async def fetch_json(page, url: str) -> tuple[str, str]:
    """Fetch *url* from inside *page* and return ``(status, body)`` as strings.

    The request is made by evaluating a JavaScript ``fetch`` in the page with
    ``credentials: 'include'``; the script reports the HTTP status and the raw
    response text back as one JSON string, which is decoded here.

    Args:
        page: object exposing ``await page.evaluate(expr, await_promise=True)``.
        url: address to request.

    Returns:
        ``(status, body)``. ``status`` is ``"-1"`` when the evaluation result
        could not be interpreted: a non-string result gives ``("-1", "")``, a
        string that is not the expected JSON object gives ``("-1", raw)``.
    """
    # json.dumps produces a literal JavaScript can always parse. Python's repr()
    # can emit escapes such as \U000e0001 that JavaScript does not understand.
    js_url = json.dumps(url)
    expr = (
        f"fetch({js_url}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
        f".then(async r => JSON.stringify({{status: r.status, body: await r.text()}}))"
    )
    raw = await page.evaluate(expr, await_promise=True)
    if not isinstance(raw, str):
        return ("-1", "")
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        return ("-1", raw)
    if not isinstance(obj, dict):
        # Valid JSON but not the {status, body} envelope the script builds.
        return ("-1", raw)
    return (str(obj.get("status", "-1")), obj.get("body", ""))

async def main():
    """Run the proof of concept end to end and print a summary.

    Steps, in order:
      1. Launch a non-headless browser (through PROXY when set) and attach a
         network handler on a blank tab before navigating.
      2. Open MARKET_URL (with ``search=`` appended when SEARCH is set), wait
         SOLVE_SECONDS for the challenge to be cleared by hand, then try to
         dismiss the consent banner.
      3. Scroll to provoke lazy-load requests, collect request URLs from the
         Resource Timing API, and dump them plus the page HTML into OUT_DIR.
      4. Derive a pagination template from the last captured sell-orders URL,
         or fall back to DEFAULT_TEMPLATE.
      5. Page the API from inside the tab for up to PAGES pages, saving each
         JSON body, and stop on a challenge page, a non-JSON or unexpected
         payload, an empty page, or the PAGES limit.

    The browser is stopped in a ``finally`` block however the run ends.
    """
    OUT_DIR.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []

    target_url = MARKET_URL
    tag = "market"
    if SEARCH:
        sep = "&" if "?" in MARKET_URL else "?"
        target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}"
        # File-name-safe tag: non-alphanumerics become '_', capped at 40 chars.
        tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40]

    print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...")
    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)

    pages_ok = items_total = floats_total = low_prec = 0
    dp_max = 0
    deepest_offset = None
    reason = "completed (hit PAGES limit)"

    try:
        # Open a blank tab first so the network handler is attached BEFORE the page
        # fires its filtered sell-orders request (otherwise we'd miss it).
        page = await browser.get("about:blank")

        async def on_request(evt):
            url = evt.request.url
            if "/market/sell-orders" in url:
                _seen_urls.append(url)

        page.add_handler(cdp.network.RequestWillBeSent, on_request)
        try:
            await page.send(cdp.network.enable())
        except Exception as ex:
            print(f"(network capture unavailable: {ex})")

        print(f"Opening {target_url}")
        await page.get(target_url)
        print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...")
        await page.sleep(SOLVE_SECONDS)

        clicked = await dismiss_consent(page)
        print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}")

        # cs.money is an Astro SSR app — the initial filtered listings are rendered
        # server-side (no client XHR to capture). Scroll to provoke lazy-load
        # pagination, which DOES fire a client request carrying the real filter params.
        print("Scrolling to trigger lazy-load pagination...")
        for _ in range(6):
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                # Best-effort: a failed scroll only reduces the chance of a capture.
                pass
            await page.sleep(2)

        # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS
        # and json.loads here (the string path is proven by fetch_json).
        async def js_list(expr: str) -> list:
            raw = await page.evaluate(f"JSON.stringify({expr})")
            try:
                return json.loads(raw) if isinstance(raw, str) else []
            except (json.JSONDecodeError, TypeError):
                return []

        # Reliable discovery via the Resource Timing API: the browser records EVERY
        # request the page made, so we read the real sell-orders URL straight out of it
        # (no flaky CDP event timing). Also dump nearby API calls for context.
        try:
            all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)")
            print(f">>> Resource Timing saw {len(all_urls)} requests total")
            if all_urls:
                # dict.fromkeys de-duplicates while keeping first-seen order.
                (OUT_DIR / "_all_requests.txt").write_text(
                    "\n".join(dict.fromkeys(all_urls)), encoding="utf-8")
            sell = [u for u in all_urls if "/market/sell-orders" in u]
            _seen_urls.extend(sell)
            api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)]
            if api:
                (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8")
                print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}")
        except Exception as ex:
            print(f"(resource-timing query failed: {ex})")

        # Dump the SSR'd page so we can see how the filter is encoded and where the
        # listings data lives (Astro embeds island props / hydration JSON in the HTML).
        try:
            html = await page.evaluate("document.documentElement.outerHTML")
            if isinstance(html, str) and html:
                (OUT_DIR / "_page.html").write_text(html, encoding="utf-8")
                print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}")
        except Exception as ex:
            print(f"(page HTML dump failed: {ex})")

        # Discovery: what sell-orders request did the page actually make?
        if _seen_urls:
            captured = _seen_urls[-1]
            template = template_from(captured)
            print("\n>>> DISCOVERED sell-orders API call the page fired:")
            print(f"    {captured}")
            print(f">>> pagination template: {template}\n")
            # Persist it — the console line is easy to lose, and this is the one bit
            # of ground truth (the real filter-param scheme) we need.
            (OUT_DIR / "_discovered.txt").write_text(
                "ALL captured sell-orders requests:\n"
                + "\n".join(dict.fromkeys(_seen_urls))
                + f"\n\npagination template:\n{template}\n",
                encoding="utf-8")
            print(f">>> saved to {OUT_DIR / '_discovered.txt'}")
        else:
            template = DEFAULT_TEMPLATE
            if SEARCH:
                template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}")
            print(f"\n(no request captured; falling back to template: {template})\n")

        # NOTE(review): the offset stride is fixed at 60, while a captured template
        # keeps whatever `limit` the page itself sent — confirm the two agree.
        for i in range(PAGES):
            offset = START_OFFSET + i * 60
            status, body = await fetch_json(page, template.format(offset))

            if looks_like_challenge(body):
                print(f"  page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.")
                (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8")
                reason = f"re-challenged at offset {offset}"
                break

            try:
                payload = json.loads(body)
            except json.JSONDecodeError:
                print(f"  page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.")
                reason = f"non-JSON at offset {offset}"
                break

            # Valid JSON that is not an object (a bare list, string, null, ...) has
            # no "items" key to read; stop cleanly so the summary still prints.
            if not isinstance(payload, dict):
                print(f"  page {i + 1} [offset {offset}]: unexpected JSON shape (status {status}). Stopping.")
                reason = f"unexpected JSON shape at offset {offset}"
                break

            items = payload.get("items", [])
            if not items:
                if status == "200":
                    print(f"  page {i + 1} [offset {offset}]: 0 items — end of results.")
                    reason = "end of results"
                else:
                    # An error status with an item-less JSON body is a failure,
                    # not a genuine end of the listing.
                    print(f"  page {i + 1} [offset {offset}]: 0 items with status {status}. Stopping.")
                    reason = f"status {status} with no items at offset {offset}"
                break

            (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8")
            pages_ok += 1
            deepest_offset = offset
            items_total += len(items)
            names = set()
            for it in items:
                # `or {}` also covers an explicit null, not just a missing key.
                asset = it.get("asset") or {}
                fl = asset.get("float")
                if fl is not None:
                    floats_total += 1
                    d = decimals(fl)
                    dp_max = max(dp_max, d)
                    if d <= 6:  # short repr — exact binary fraction (e.g. 1/16), not truncation
                        low_prec += 1
                full_name = (asset.get("names") or {}).get("full")
                if full_name is not None:
                    # Items without a name must not count as a distinct name.
                    names.add(full_name)
            sample = next(iter(names), None) if SEARCH else None
            print(f"  page {i + 1} [offset {offset}] OK — {len(items)} items"
                  + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else ""))

            # Pace the next request: fixed base delay plus random jitter.
            await page.sleep(DELAY + random.uniform(0, JITTER))

        print("\n=== summary ===")
        print(f"  query: {SEARCH or '(whole market)'}")
        print(f"  stopped: {reason}")
        print(f"  clean pages: {pages_ok}  deepest offset: {deepest_offset}  items: {items_total}")
        if floats_total:
            # Truncation would make MANY values short, not one exact binary fraction.
            verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION"
            print(f"  floats: {floats_total} items, {dp_max}-decimal max, "
                  f"{low_prec} short-repr (exact fractions) — {verdict}")
        print(f"  files in {OUT_DIR}")
    finally:
        browser.stop()


# Script entry point: only runs when the file is executed directly, not on import.
if __name__ == "__main__":
    # Drive main() to completion on the event loop returned by uc.loop().
    uc.loop().run_until_complete(main())