""" Proof-of-concept / pre-fleet validation for the cs.money scraper. Proves the things we need before building the C2 + worker fleet: 1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't. 2. a single WARM session can page the sell-orders API deeply without re-challenge. 3. a free-text market search (e.g. "cyber security ft") can be turned into a filtered sell-orders API call — we DISCOVER the real API params by capturing the request the page itself fires, instead of guessing. It opens the market (optionally a search URL) in a real non-headless Chromium, lets you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the sell-orders request the page makes, then pages that API from inside the cleared page (same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge. cd worker .venv\\Scripts\\Activate.ps1 pip install -r requirements.txt python poc.py # whole-market sweep $env:SEARCH="cyber security ft"; python poc.py # targeted: FT M4A4 Cyber Security Env knobs (all optional): SEARCH free-text market search; when set, scrape only those results MARKET_URL market page base (default the buy market) SOLVE_SECONDS seconds to wait for you to clear Cloudflare (default 30) PAGES how many offset pages (60 each) to attempt (default 20) START_OFFSET first offset (default 0) DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5) PROXY host:port for an auth-free proxy (omit to use your own IP) BROWSER_PATH path to Chrome/Edge if auto-detect fails """ import json import os import pathlib import random from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit import nodriver as uc from nodriver import cdp SEARCH = os.environ.get("SEARCH") MARKET_URL = os.environ.get("MARKET_URL", "https://cs.money/market/buy/") SOLVE_SECONDS = int(os.environ.get("SOLVE_SECONDS", "30")) PAGES = int(os.environ.get("PAGES", "20")) START_OFFSET = int(os.environ.get("START_OFFSET", "0")) DELAY = float(os.environ.get("DELAY", "2.0")) JITTER = float(os.environ.get("JITTER", "1.5")) PROXY = os.environ.get("PROXY") BROWSER_PATH = os.environ.get("BROWSER_PATH") # Fallback template if we fail to capture the page's own request (offset = {}). DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}" OUT_DIR = pathlib.Path(__file__).parent / "captures" CONSENT_LABELS = ["Reject all", "Reject All", "Only necessary", "Necessary only", "Reject", "Decline", "Deny"] # Filled by the CDP network handler with sell-orders request URLs the page fires. _seen_urls: list[str] = [] def looks_like_challenge(body: str) -> bool: s = (body or "").lstrip() return not s or s.startswith("<") or "Just a moment" in body or "challenge-platform" in body def decimals(v: float) -> int: r = repr(float(v)) return len(r.split(".")[-1]) if "." in r else 0 def template_from(url: str) -> str: """Turn a captured sell-orders URL into a template with offset as '{}', preserving every other param (the search/filter encoding we want to learn).""" parts = urlsplit(url) q = [(k, v) for k, v in parse_qsl(parts.query, keep_blank_values=True) if k != "offset"] if not any(k == "limit" for k, _ in q): q.append(("limit", "60")) base_q = urlencode(q) new_q = (base_q + "&" if base_q else "") + "offset={}" return urlunsplit((parts.scheme, parts.netloc, parts.path, new_q, "")) async def dismiss_consent(page) -> str | None: """Best-effort, privacy-preserving — never clicks 'Accept all'.""" for label in CONSENT_LABELS: try: el = await page.find(label, best_match=True, timeout=2) except Exception: el = None if el: try: await el.click() return label except Exception: pass return None async def fetch_json(page, url: str) -> tuple[str, str]: expr = ( f"fetch({url!r}, {{credentials:'include', headers:{{'accept':'application/json'}}}})" f".then(async r => JSON.stringify({{status: r.status, body: await r.text()}}))" ) raw = await page.evaluate(expr, await_promise=True) if not isinstance(raw, str): return ("-1", "") try: obj = json.loads(raw) return (str(obj.get("status", "-1")), obj.get("body", "")) except json.JSONDecodeError: return ("-1", raw) async def main(): OUT_DIR.mkdir(exist_ok=True) args = [f"--proxy-server={PROXY}"] if PROXY else [] target_url = MARKET_URL tag = "market" if SEARCH: sep = "&" if "?" in MARKET_URL else "?" target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}" tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40] print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...") browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args) pages_ok = items_total = floats_total = low_prec = 0 dp_min, dp_max = 99, 0 deepest_offset = None reason = "completed (hit PAGES limit)" try: # Open a blank tab first so the network handler is attached BEFORE the page # fires its filtered sell-orders request (otherwise we'd miss it). page = await browser.get("about:blank") async def on_request(evt): url = evt.request.url if "/market/sell-orders" in url: _seen_urls.append(url) page.add_handler(cdp.network.RequestWillBeSent, on_request) try: await page.send(cdp.network.enable()) except Exception as ex: print(f"(network capture unavailable: {ex})") print(f"Opening {target_url}") await page.get(target_url) print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...") await page.sleep(SOLVE_SECONDS) clicked = await dismiss_consent(page) print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}") # Reliable discovery via the Resource Timing API: the browser records EVERY # request the page made, so we read the real sell-orders URL straight out of it # (no flaky CDP event timing). Also dump nearby API calls for context. # cs.money is an Astro SSR app — the initial filtered listings are rendered # server-side (no client XHR to capture). Scroll to provoke lazy-load # pagination, which DOES fire a client request carrying the real filter params. print("Scrolling to trigger lazy-load pagination...") for _ in range(6): try: await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") except Exception: pass await page.sleep(2) # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS # and json.loads here (the string path is proven by fetch_json). async def js_list(expr: str) -> list: raw = await page.evaluate(f"JSON.stringify({expr})") try: return json.loads(raw) if isinstance(raw, str) else [] except (json.JSONDecodeError, TypeError): return [] try: all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)") print(f">>> Resource Timing saw {len(all_urls)} requests total") if all_urls: (OUT_DIR / "_all_requests.txt").write_text( "\n".join(dict.fromkeys(all_urls)), encoding="utf-8") sell = [u for u in all_urls if "/market/sell-orders" in u] _seen_urls.extend(sell) api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)] if api: (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8") print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}") except Exception as ex: print(f"(resource-timing query failed: {ex})") # Dump the SSR'd page so we can see how the filter is encoded and where the # listings data lives (Astro embeds island props / hydration JSON in the HTML). try: html = await page.evaluate("document.documentElement.outerHTML") if isinstance(html, str) and html: (OUT_DIR / "_page.html").write_text(html, encoding="utf-8") print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}") except Exception as ex: print(f"(page HTML dump failed: {ex})") # Discovery: what sell-orders request did the page actually make? if _seen_urls: captured = _seen_urls[-1] template = template_from(captured) print("\n>>> DISCOVERED sell-orders API call the page fired:") print(f" {captured}") print(f">>> pagination template: {template}\n") # Persist it — the console line is easy to lose, and this is the one bit # of ground truth (the real filter-param scheme) we need. (OUT_DIR / "_discovered.txt").write_text( "ALL captured sell-orders requests:\n" + "\n".join(dict.fromkeys(_seen_urls)) + f"\n\npagination template:\n{template}\n", encoding="utf-8") print(f">>> saved to {OUT_DIR / '_discovered.txt'}") else: template = DEFAULT_TEMPLATE if SEARCH: template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}") print(f"\n(no request captured; falling back to template: {template})\n") for i in range(PAGES): offset = START_OFFSET + i * 60 status, body = await fetch_json(page, template.format(offset)) if looks_like_challenge(body): print(f" page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.") (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8") reason = f"re-challenged at offset {offset}" break try: items = json.loads(body).get("items", []) except json.JSONDecodeError: print(f" page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.") reason = f"non-JSON at offset {offset}" break if not items: print(f" page {i + 1} [offset {offset}]: 0 items — end of results.") reason = "end of results" break (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8") pages_ok += 1 deepest_offset = offset items_total += len(items) names = set() for it in items: fl = it.get("asset", {}).get("float") if fl is not None: floats_total += 1 d = decimals(fl) dp_min, dp_max = min(dp_min, d), max(dp_max, d) if d <= 6: # short repr — exact binary fraction (e.g. 1/16), not truncation low_prec += 1 names.add(it.get("asset", {}).get("names", {}).get("full")) sample = next(iter(names), None) if SEARCH else None print(f" page {i + 1} [offset {offset}] OK — {len(items)} items" + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else "")) await page.sleep(DELAY + random.uniform(0, JITTER)) print("\n=== summary ===") print(f" query: {SEARCH or '(whole market)'}") print(f" stopped: {reason}") print(f" clean pages: {pages_ok} deepest offset: {deepest_offset} items: {items_total}") if floats_total: # Truncation would make MANY values short, not one exact binary fraction. verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION" print(f" floats: {floats_total} items, {dp_max}-decimal max, " f"{low_prec} short-repr (exact fractions) — {verdict}") print(f" files in {OUT_DIR}") finally: browser.stop() if __name__ == "__main__": uc.loop().run_until_complete(main())