Files
Operation-Blue-Laminate-v2/worker/poc.py
bob dc7c3f99ae Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration.

IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 15:03:53 -05:00

286 lines
12 KiB
Python

"""
Proof-of-concept / pre-fleet validation for the cs.money scraper.
Proves the things we need before building the C2 + worker fleet:
1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't.
2. a single WARM session can page the sell-orders API deeply without re-challenge.
3. a free-text market search (e.g. "cyber security ft") can be turned into a
filtered sell-orders API call — we DISCOVER the real API params by capturing the
request the page itself fires, instead of guessing.
It opens the market (optionally a search URL) in a real non-headless Chromium, lets
you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the
sell-orders request the page makes, then pages that API from inside the cleared page
(same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge.
cd worker
.venv\\Scripts\\Activate.ps1
pip install -r requirements.txt
python poc.py # whole-market sweep
$env:SEARCH="cyber security ft"; python poc.py # targeted: FT M4A4 Cyber Security
Env knobs (all optional):
SEARCH free-text market search; when set, scrape only those results
MARKET_URL market page base (default the buy market)
SOLVE_SECONDS seconds to wait for you to clear Cloudflare (default 30)
PAGES how many offset pages (60 each) to attempt (default 20)
START_OFFSET first offset (default 0)
DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5)
PROXY host:port for an auth-free proxy (omit to use your own IP)
BROWSER_PATH path to Chrome/Edge if auto-detect fails
"""
import json
import os
import pathlib
import random
from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit
import nodriver as uc
from nodriver import cdp
# --- Runtime knobs; each one is optional and read from the environment ---
SEARCH = os.getenv("SEARCH")
MARKET_URL = os.getenv("MARKET_URL", "https://cs.money/market/buy/")
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))
PAGES = int(os.getenv("PAGES", "20"))
START_OFFSET = int(os.getenv("START_OFFSET", "0"))
DELAY = float(os.getenv("DELAY", "2.0"))
JITTER = float(os.getenv("JITTER", "1.5"))
PROXY = os.getenv("PROXY")
BROWSER_PATH = os.getenv("BROWSER_PATH")

# Used only when the page's own sell-orders request could not be captured;
# the '{}' placeholder receives the offset.
DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}"

# Captured responses and discovery notes are written next to this script.
OUT_DIR = pathlib.Path(__file__).parent.joinpath("captures")

# Button captions tried, in this order, by dismiss_consent(); every entry is a
# reject/decline style choice.
CONSENT_LABELS = [
    "Reject all",
    "Reject All",
    "Only necessary",
    "Necessary only",
    "Reject",
    "Decline",
    "Deny",
]

# Sell-orders request URLs observed while the page loads (appended to by the
# CDP network handler and by the Resource Timing query in main()).
_seen_urls: list[str] = []
def looks_like_challenge(body: str) -> bool:
    """Tell whether *body* looks like a challenge page instead of API JSON.

    A body counts as a challenge when it is empty (or ``None`` / whitespace
    only), when its first non-blank character is ``<`` (markup rather than
    JSON), or when it contains one of the marker strings checked below.
    """
    stripped = (body or "").lstrip()
    if not stripped:
        return True
    if stripped.startswith("<"):
        return True
    markers = ("Just a moment", "challenge-platform")
    return any(marker in body for marker in markers)
def decimals(v: float) -> int:
    """Return how many digits follow the decimal point in ``repr(float(v))``.

    ``repr`` switches to scientific notation for small and large magnitudes
    (e.g. ``1.234e-05``), so the exponent is folded into the count: the
    mantissa's fractional digits minus the exponent, never below zero.
    Values whose repr has neither a ``.`` nor an exponent (``inf``, ``nan``)
    yield 0.

    Args:
        v: Anything ``float()`` accepts.

    Returns:
        The number of decimal places of the shortest round-tripping repr.

    Raises:
        TypeError, ValueError: propagated from ``float(v)``.
    """
    text = repr(float(v))
    mantissa, _, exponent = text.partition("e")
    frac_digits = len(mantissa.split(".")[-1]) if "." in mantissa else 0
    if exponent:
        # '1.234e-05' is 0.00001234: 3 mantissa digits + 5 shifted places = 8.
        frac_digits = max(0, frac_digits - int(exponent))
    return frac_digits
def template_from(url: str) -> str:
    """Convert a captured sell-orders URL into a pagination template.

    Every query parameter except ``offset`` is kept in its original order
    (this is the search/filter encoding we want to learn), ``limit=60`` is
    appended when the URL carries no ``limit``, and a literal ``offset={}``
    placeholder is added last. Any URL fragment is dropped.
    """
    scheme, netloc, path, query, _fragment = urlsplit(url)

    kept = []
    has_limit = False
    for key, value in parse_qsl(query, keep_blank_values=True):
        if key == "offset":
            continue
        has_limit = has_limit or key == "limit"
        kept.append((key, value))
    if not has_limit:
        kept.append(("limit", "60"))

    # The placeholder is joined on by hand: urlencode() would percent-encode
    # its braces.
    pieces = [piece for piece in (urlencode(kept), "offset={}") if piece]
    return urlunsplit((scheme, netloc, path, "&".join(pieces), ""))
async def dismiss_consent(page) -> str | None:
    """Try to close the cookie banner using a reject-style button.

    Each caption in CONSENT_LABELS is looked up on the page in turn (2 second
    timeout per lookup) and the first element that can be clicked wins.
    Lookup and click failures are swallowed on purpose: this is best-effort
    and privacy-preserving, and it never clicks 'Accept all'.

    Returns:
        The caption that was clicked, or ``None`` when nothing could be clicked.
    """
    for caption in CONSENT_LABELS:
        try:
            button = await page.find(caption, best_match=True, timeout=2)
        except Exception:
            continue
        if not button:
            continue
        try:
            await button.click()
        except Exception:
            # The element was found but could not be clicked; try the next caption.
            continue
        return caption
    return None
async def fetch_json(page, url: str) -> tuple[str, str]:
    """Fetch *url* from inside the page and return ``(status, body)`` as strings.

    The request is issued by the page's own ``fetch`` with
    ``credentials:'include'``, and the JavaScript side wraps the HTTP status
    and the raw response text in a JSON envelope that is decoded here.

    Args:
        page: Object exposing an awaitable ``evaluate(expr, await_promise=True)``.
        url: Absolute URL to request.

    Returns:
        ``(status, body)``. ``status`` is ``"-1"`` when no usable envelope came
        back: ``body`` is then ``""`` if the evaluation did not return a
        string, otherwise the raw string that was returned.
    """
    # json.dumps() yields a valid JavaScript string literal for any Python str.
    # repr() uses Python escaping, which can emit '\UXXXXXXXX' sequences that
    # JavaScript does not interpret.
    js_url = json.dumps(url)
    expr = (
        f"fetch({js_url}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
        ".then(async r => JSON.stringify({status: r.status, body: await r.text()}))"
    )
    raw = await page.evaluate(expr, await_promise=True)
    if not isinstance(raw, str):
        return ("-1", "")
    try:
        envelope = json.loads(raw)
    except json.JSONDecodeError:
        return ("-1", raw)
    if not isinstance(envelope, dict):
        # Valid JSON, but not the {status, body} envelope built above.
        return ("-1", raw)
    body = envelope.get("body", "")
    return (str(envelope.get("status", "-1")), body if isinstance(body, str) else "")
async def main():
    """Run the proof-of-concept end to end and print a summary.

    Steps, in order:
      1. Launch a visible (non-headless) Chromium through nodriver, passing
         ``--proxy-server`` when PROXY is set.
      2. Attach a CDP network listener on a blank tab, open the market (or
         search) URL, and wait SOLVE_SECONDS for the challenge to be cleared.
      3. Dismiss the cookie banner, scroll to provoke lazy-load requests, and
         collect sell-orders URLs from the CDP listener and from the Resource
         Timing API; dump the request list and the page HTML under OUT_DIR.
      4. Build a pagination template from the last captured URL (or fall back
         to DEFAULT_TEMPLATE) and page it with fetch_json, saving each page
         under OUT_DIR. Paging stops on a re-challenge, on a non-JSON or
         unexpectedly shaped body, on an empty page, or after PAGES pages.
      5. Print totals and a float-precision verdict.

    The browser is stopped in a ``finally`` block whatever happens.
    """
    OUT_DIR.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []
    target_url = MARKET_URL
    tag = "market"
    if SEARCH:
        sep = "&" if "?" in MARKET_URL else "?"
        target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}"
        # File-name-safe tag: non-alphanumerics become '_', capped at 40 chars.
        tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40]

    print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...")
    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)

    pages_ok = items_total = floats_total = low_prec = 0
    dp_max = 0
    deepest_offset = None
    reason = "completed (hit PAGES limit)"
    try:
        # Open a blank tab first so the network handler is attached BEFORE the page
        # fires its filtered sell-orders request (otherwise we'd miss it).
        page = await browser.get("about:blank")

        async def on_request(evt):
            url = evt.request.url
            if "/market/sell-orders" in url:
                _seen_urls.append(url)

        page.add_handler(cdp.network.RequestWillBeSent, on_request)
        try:
            await page.send(cdp.network.enable())
        except Exception as ex:
            print(f"(network capture unavailable: {ex})")

        print(f"Opening {target_url}")
        await page.get(target_url)
        print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...")
        await page.sleep(SOLVE_SECONDS)

        clicked = await dismiss_consent(page)
        print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}")

        # cs.money is an Astro SSR app — the initial filtered listings are rendered
        # server-side (no client XHR to capture). Scroll to provoke lazy-load
        # pagination, which DOES fire a client request carrying the real filter params.
        print("Scrolling to trigger lazy-load pagination...")
        for _ in range(6):
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                # Best-effort: a failed scroll must not abort discovery.
                pass
            await page.sleep(2)

        # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS
        # and json.loads here (the string path is proven by fetch_json).
        async def js_list(expr: str) -> list:
            raw = await page.evaluate(f"JSON.stringify({expr})")
            try:
                return json.loads(raw) if isinstance(raw, str) else []
            except (json.JSONDecodeError, TypeError):
                return []

        # Reliable discovery via the Resource Timing API: the browser records EVERY
        # request the page made, so we read the real sell-orders URL straight out of it
        # (no flaky CDP event timing). Also dump nearby API calls for context.
        try:
            all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)")
            print(f">>> Resource Timing saw {len(all_urls)} requests total")
            if all_urls:
                (OUT_DIR / "_all_requests.txt").write_text(
                    "\n".join(dict.fromkeys(all_urls)), encoding="utf-8")
            sell = [u for u in all_urls if "/market/sell-orders" in u]
            _seen_urls.extend(sell)
            api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)]
            if api:
                (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8")
                print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}")
        except Exception as ex:
            print(f"(resource-timing query failed: {ex})")

        # Dump the SSR'd page so we can see how the filter is encoded and where the
        # listings data lives (Astro embeds island props / hydration JSON in the HTML).
        try:
            html = await page.evaluate("document.documentElement.outerHTML")
            if isinstance(html, str) and html:
                (OUT_DIR / "_page.html").write_text(html, encoding="utf-8")
                print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}")
        except Exception as ex:
            print(f"(page HTML dump failed: {ex})")

        # Discovery: what sell-orders request did the page actually make?
        if _seen_urls:
            captured = _seen_urls[-1]
            template = template_from(captured)
            print("\n>>> DISCOVERED sell-orders API call the page fired:")
            print(f" {captured}")
            print(f">>> pagination template: {template}\n")
            # Persist it — the console line is easy to lose, and this is the one bit
            # of ground truth (the real filter-param scheme) we need.
            (OUT_DIR / "_discovered.txt").write_text(
                "ALL captured sell-orders requests:\n"
                + "\n".join(dict.fromkeys(_seen_urls))
                + f"\n\npagination template:\n{template}\n",
                encoding="utf-8")
            print(f">>> saved to {OUT_DIR / '_discovered.txt'}")
        else:
            template = DEFAULT_TEMPLATE
            if SEARCH:
                template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}")
            print(f"\n(no request captured; falling back to template: {template})\n")

        # Step by the template's own ``limit`` so consecutive pages neither
        # overlap nor skip items when the captured request used a page size
        # other than 60. Falls back to 60 when no usable limit is present.
        page_size = 60
        for key, value in parse_qsl(urlsplit(template).query):
            if key != "limit":
                continue
            try:
                parsed_limit = int(value)
            except ValueError:
                continue
            if parsed_limit > 0:
                page_size = parsed_limit

        for i in range(PAGES):
            offset = START_OFFSET + i * page_size
            status, body = await fetch_json(page, template.format(offset))

            if looks_like_challenge(body):
                print(f" page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.")
                (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8")
                reason = f"re-challenged at offset {offset}"
                break

            try:
                payload = json.loads(body)
            except json.JSONDecodeError:
                print(f" page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.")
                reason = f"non-JSON at offset {offset}"
                break

            # Valid JSON is not necessarily an object with an ``items`` list;
            # stop cleanly instead of raising AttributeError on other shapes.
            # A missing or null ``items`` is treated as an empty page.
            items = (payload.get("items") or []) if isinstance(payload, dict) else None
            if not isinstance(items, list):
                print(f" page {i + 1} [offset {offset}]: unexpected JSON shape (status {status}). Stopping.")
                reason = f"unexpected JSON shape at offset {offset}"
                break

            if not items:
                print(f" page {i + 1} [offset {offset}]: 0 items — end of results.")
                reason = "end of results"
                break

            (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8")
            pages_ok += 1
            deepest_offset = offset
            items_total += len(items)

            names = set()
            for it in items:
                # Tolerate items whose ``asset`` / ``names`` is missing or null.
                asset = it.get("asset") if isinstance(it, dict) else None
                if not isinstance(asset, dict):
                    asset = {}
                fl = asset.get("float")
                if fl is not None:
                    floats_total += 1
                    d = decimals(fl)
                    dp_max = max(dp_max, d)
                    if d <= 6:  # short repr — exact binary fraction (e.g. 1/16), not truncation
                        low_prec += 1
                name_info = asset.get("names")
                full_name = name_info.get("full") if isinstance(name_info, dict) else None
                if isinstance(full_name, str):
                    # Only real names count towards the distinct-name tally.
                    names.add(full_name)

            sample = next(iter(names), None) if SEARCH else None
            print(f" page {i + 1} [offset {offset}] OK — {len(items)} items"
                  + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else ""))

            # Pace requests: base delay plus random jitter.
            await page.sleep(DELAY + random.uniform(0, JITTER))

        print("\n=== summary ===")
        print(f" query: {SEARCH or '(whole market)'}")
        print(f" stopped: {reason}")
        print(f" clean pages: {pages_ok} deepest offset: {deepest_offset} items: {items_total}")
        if floats_total:
            # Truncation would make MANY values short, not one exact binary fraction.
            verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION"
            print(f" floats: {floats_total} items, {dp_max}-decimal max, "
                  f"{low_prec} short-repr (exact fractions) — {verdict}")
        print(f" files in {OUT_DIR}")
    finally:
        browser.stop()
# Script entry point: run main() to completion on the event loop object
# returned by nodriver's uc.loop().
if __name__ == "__main__":
    uc.loop().run_until_complete(main())