Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
286 lines
12 KiB
Python
286 lines
12 KiB
Python
"""
|
|
Proof-of-concept / pre-fleet validation for the cs.money scraper.
|
|
|
|
Proves the things we need before building the C2 + worker fleet:
|
|
1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't.
|
|
2. a single WARM session can page the sell-orders API deeply without re-challenge.
|
|
3. a free-text market search (e.g. "cyber security ft") can be turned into a
|
|
filtered sell-orders API call — we DISCOVER the real API params by capturing the
|
|
request the page itself fires, instead of guessing.
|
|
|
|
It opens the market (optionally a search URL) in a real non-headless Chromium, lets
|
|
you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the
|
|
sell-orders request the page makes, then pages that API from inside the cleared page
|
|
(same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge.
|
|
|
|
cd worker
|
|
.venv\\Scripts\\Activate.ps1
|
|
pip install -r requirements.txt
|
|
|
|
python poc.py # whole-market sweep
|
|
$env:SEARCH="cyber security ft"; python poc.py # targeted: FT M4A4 Cyber Security
|
|
|
|
Env knobs (all optional):
|
|
SEARCH free-text market search; when set, scrape only those results
|
|
MARKET_URL market page base (default the buy market)
|
|
SOLVE_SECONDS seconds to wait for you to clear Cloudflare (default 30)
|
|
PAGES how many offset pages (60 each) to attempt (default 20)
|
|
START_OFFSET first offset (default 0)
|
|
DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5)
|
|
PROXY host:port for an auth-free proxy (omit to use your own IP)
|
|
BROWSER_PATH path to Chrome/Edge if auto-detect fails
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import pathlib
|
|
import random
|
|
from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit
|
|
|
|
import nodriver as uc
|
|
from nodriver import cdp
|
|
|
|
# --- Runtime knobs: every one is optional and read from the environment once at import. ---
SEARCH = os.getenv("SEARCH")  # free-text market search; unset means a whole-market sweep
MARKET_URL = os.getenv("MARKET_URL", "https://cs.money/market/buy/")
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))  # wait for the human to clear Cloudflare
PAGES = int(os.getenv("PAGES", "20"))  # how many offset pages to attempt
START_OFFSET = int(os.getenv("START_OFFSET", "0"))
DELAY = float(os.getenv("DELAY", "2.0"))  # base seconds between fetches
JITTER = float(os.getenv("JITTER", "1.5"))  # extra random seconds added to DELAY
PROXY = os.getenv("PROXY")  # host:port, passed to Chromium as --proxy-server
BROWSER_PATH = os.getenv("BROWSER_PATH")

# Pagination template used when the page's own request could not be captured;
# the single '{}' placeholder receives the offset.
DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}"

# All captures land next to this script.
OUT_DIR = pathlib.Path(__file__).parent.joinpath("captures")

# Button texts tried, in order, to dismiss the cookie banner. Deliberately
# contains no "accept" wording.
CONSENT_LABELS = [
    "Reject all",
    "Reject All",
    "Only necessary",
    "Necessary only",
    "Reject",
    "Decline",
    "Deny",
]

# sell-orders request URLs observed while the page loads; appended to by main()
# (CDP network handler and the Resource Timing scan).
_seen_urls: list[str] = []
|
|
|
|
|
|
def looks_like_challenge(body: str) -> bool:
    """Return True when *body* does not look like an API JSON response.

    A body counts as a challenge when it is missing/blank, when it starts
    with '<' after leading whitespace (HTML rather than JSON), or when it
    contains one of the Cloudflare interstitial markers.
    """
    stripped = (body or "").lstrip()
    if not stripped:
        return True
    if stripped.startswith("<"):
        return True
    # Marker search runs on the untouched body, exactly as received.
    return any(marker in body for marker in ("Just a moment", "challenge-platform"))
|
|
|
|
|
|
def decimals(v: float) -> int:
    """Return how many decimal places the shortest round-trip repr of *v* has.

    The count is taken from the decimal exponent of ``repr(float(v))`` rather
    than by counting characters after the '.', because Python switches to
    scientific notation for small magnitudes: ``repr(1.234e-05)`` is
    ``'1.234e-05'``, which has 8 decimal places, not the 7 characters that
    follow the dot (and ``'1e-05'`` has no dot at all).

    Args:
        v: Any value accepted by ``float()``.

    Returns:
        The number of digits after the decimal point (``1.0`` -> 1,
        ``0.0625`` -> 4); 0 for values with a non-negative exponent and for
        non-finite values (inf / nan).

    Raises:
        TypeError, ValueError: If ``float(v)`` itself fails.
    """
    from decimal import Decimal  # local import: the block stays self-contained

    exponent = Decimal(repr(float(v))).as_tuple().exponent
    # inf / nan report a string exponent ('F', 'n', 'N') instead of an int.
    if not isinstance(exponent, int):
        return 0
    return max(0, -exponent)
|
|
|
|
|
|
def template_from(url: str) -> str:
    """Build a pagination template from a captured sell-orders URL.

    Every query parameter except ``offset`` is carried over in its original
    order (blank values included), ``limit=60`` is appended when the URL has
    no ``limit``, and a trailing ``offset={}`` placeholder is added for
    ``str.format``. Any URL fragment is dropped.
    """
    scheme, netloc, path, query, _fragment = urlsplit(url)

    kept = []
    has_limit = False
    for key, value in parse_qsl(query, keep_blank_values=True):
        if key == "offset":
            continue
        has_limit = has_limit or key == "limit"
        kept.append((key, value))
    if not has_limit:
        kept.append(("limit", "60"))

    # The placeholder is appended after encoding so its braces stay literal.
    paged_query = "&".join(part for part in (urlencode(kept), "offset={}") if part)
    return urlunsplit((scheme, netloc, path, paged_query, ""))
|
|
|
|
|
|
async def dismiss_consent(page) -> str | None:
    """Try to close the cookie banner without accepting anything.

    Walks CONSENT_LABELS in order, looks each text up on *page* and clicks
    the first match that can be clicked. Lookup and click failures are
    ignored on purpose (best effort). Returns the label that was clicked,
    or None when nothing could be clicked.
    """
    for text in CONSENT_LABELS:
        try:
            button = await page.find(text, best_match=True, timeout=2)
        except Exception:
            continue  # lookup failed or timed out: try the next label
        if not button:
            continue
        try:
            await button.click()
        except Exception:
            continue  # found but not clickable: try the next label
        return text
    return None
|
|
|
|
|
|
async def fetch_json(page, url: str) -> tuple[str, str]:
    """Fetch *url* from inside *page* and return ``(status, body)`` as strings.

    The request is issued by the page's own ``fetch`` with credentials
    included, and the response is marshalled back as one JSON string holding
    the HTTP status and the raw response text.

    Args:
        page: Object exposing an awaitable ``evaluate(expr, await_promise=True)``.
        url: Absolute URL to request.

    Returns:
        ``(status, body)``. The status is ``"-1"`` when the evaluation did not
        yield a usable result: a non-string result gives ``("-1", "")``; a
        string that is not a JSON object gives ``("-1", raw)``.
    """
    # json.dumps produces a valid JavaScript string literal for any URL;
    # Python's repr() does not (its escape syntax differs from JavaScript's).
    js_url = json.dumps(url)
    expr = (
        f"fetch({js_url}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
        ".then(async r => JSON.stringify({status: r.status, body: await r.text()}))"
    )
    raw = await page.evaluate(expr, await_promise=True)
    if not isinstance(raw, str):
        return ("-1", "")
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        return ("-1", raw)
    if not isinstance(obj, dict):
        # Valid JSON but not the {status, body} envelope built above.
        return ("-1", raw)
    body = obj.get("body", "")
    return (str(obj.get("status", "-1")), body if isinstance(body, str) else "")
|
|
|
|
|
|
async def main():
    """Run the proof of concept end to end.

    Steps, in order:
      1. Launch a non-headless nodriver Chromium (optionally through PROXY).
      2. Open the market (or the SEARCH url), wait SOLVE_SECONDS, then try to
         dismiss the consent banner.
      3. Discover the sell-orders request the page fires (CDP request events
         plus the Resource Timing API) and derive a pagination template,
         falling back to DEFAULT_TEMPLATE when nothing was captured.
      4. Page that API from inside the page for up to PAGES pages, saving each
         response under OUT_DIR and stopping on a challenge, a non-JSON body
         or an empty page.
      5. Print a summary. The browser is stopped even when a step raises.
    """
    OUT_DIR.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []

    target_url = MARKET_URL
    tag = "market"
    if SEARCH:
        sep = "&" if "?" in MARKET_URL else "?"
        target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}"
        # File-name-safe tag: non-alphanumerics become '_', capped at 40 chars.
        tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40]

    print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...")
    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)

    pages_ok = items_total = floats_total = low_prec = 0
    dp_max = 0
    deepest_offset = None
    reason = "completed (hit PAGES limit)"

    try:
        # Open a blank tab first so the network handler is attached BEFORE the page
        # fires its filtered sell-orders request (otherwise we'd miss it).
        page = await browser.get("about:blank")

        async def on_request(evt):
            url = evt.request.url
            if "/market/sell-orders" in url:
                _seen_urls.append(url)

        page.add_handler(cdp.network.RequestWillBeSent, on_request)
        try:
            await page.send(cdp.network.enable())
        except Exception as ex:
            print(f"(network capture unavailable: {ex})")

        print(f"Opening {target_url}")
        await page.get(target_url)
        print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...")
        await page.sleep(SOLVE_SECONDS)

        clicked = await dismiss_consent(page)
        print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}")

        # cs.money is an Astro SSR app — the initial filtered listings are rendered
        # server-side (no client XHR to capture). Scroll to provoke lazy-load
        # pagination, which DOES fire a client request carrying the real filter params.
        print("Scrolling to trigger lazy-load pagination...")
        for _ in range(6):
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass  # best effort: a failed scroll only means less to discover
            await page.sleep(2)

        # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS
        # and json.loads here (the string path is proven by fetch_json).
        async def js_list(expr: str) -> list:
            raw = await page.evaluate(f"JSON.stringify({expr})")
            try:
                return json.loads(raw) if isinstance(raw, str) else []
            except (json.JSONDecodeError, TypeError):
                return []

        # Reliable discovery via the Resource Timing API: the browser records EVERY
        # request the page made, so we read the real sell-orders URL straight out of it
        # (no flaky CDP event timing). Also dump nearby API calls for context.
        try:
            all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)")
            print(f">>> Resource Timing saw {len(all_urls)} requests total")
            if all_urls:
                (OUT_DIR / "_all_requests.txt").write_text(
                    "\n".join(dict.fromkeys(all_urls)), encoding="utf-8")
            sell = [u for u in all_urls if "/market/sell-orders" in u]
            _seen_urls.extend(sell)
            api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)]
            if api:
                (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8")
                print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}")
        except Exception as ex:
            print(f"(resource-timing query failed: {ex})")

        # Dump the SSR'd page so we can see how the filter is encoded and where the
        # listings data lives (Astro embeds island props / hydration JSON in the HTML).
        try:
            html = await page.evaluate("document.documentElement.outerHTML")
            if isinstance(html, str) and html:
                (OUT_DIR / "_page.html").write_text(html, encoding="utf-8")
                print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}")
        except Exception as ex:
            print(f"(page HTML dump failed: {ex})")

        # Discovery: what sell-orders request did the page actually make?
        if _seen_urls:
            captured = _seen_urls[-1]
            template = template_from(captured)
            print("\n>>> DISCOVERED sell-orders API call the page fired:")
            print(f" {captured}")
            print(f">>> pagination template: {template}\n")
            # Persist it — the console line is easy to lose, and this is the one bit
            # of ground truth (the real filter-param scheme) we need.
            (OUT_DIR / "_discovered.txt").write_text(
                "ALL captured sell-orders requests:\n"
                + "\n".join(dict.fromkeys(_seen_urls))
                + f"\n\npagination template:\n{template}\n",
                encoding="utf-8")
            print(f">>> saved to {OUT_DIR / '_discovered.txt'}")
        else:
            template = DEFAULT_TEMPLATE
            if SEARCH:
                template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}")
            print(f"\n(no request captured; falling back to template: {template})\n")

        for i in range(PAGES):
            offset = START_OFFSET + i * 60
            status, body = await fetch_json(page, template.format(offset))

            if looks_like_challenge(body):
                print(f" page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.")
                (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8")
                reason = f"re-challenged at offset {offset}"
                break

            try:
                payload = json.loads(body)
            except json.JSONDecodeError:
                payload = None
            # Valid JSON that is not an object (list, null, ...) cannot carry
            # "items"; treat it like a non-JSON body instead of crashing.
            if not isinstance(payload, dict):
                print(f" page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.")
                reason = f"non-JSON at offset {offset}"
                break

            items = payload.get("items") or []
            if not items:
                print(f" page {i + 1} [offset {offset}]: 0 items — end of results.")
                reason = "end of results"
                break

            (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8")
            pages_ok += 1
            deepest_offset = offset
            items_total += len(items)
            names = set()
            for it in items:
                if not isinstance(it, dict):
                    continue
                # "asset" / "names" may be missing or null; both read as empty.
                asset = it.get("asset") or {}
                fl = asset.get("float")
                if fl is not None:
                    try:
                        d = decimals(fl)
                    except (TypeError, ValueError):
                        d = None  # not a number: leave it out of the precision stats
                    if d is not None:
                        floats_total += 1
                        dp_max = max(dp_max, d)
                        if d <= 6:  # short repr — exact binary fraction (e.g. 1/16), not truncation
                            low_prec += 1
                full_name = (asset.get("names") or {}).get("full")
                if full_name is not None:
                    names.add(full_name)  # a missing name is not a distinct name
            sample = next(iter(names), None) if SEARCH else None
            print(f" page {i + 1} [offset {offset}] OK — {len(items)} items"
                  + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else ""))

            # Pace between fetches only; nothing follows the final page.
            if i + 1 < PAGES:
                await page.sleep(DELAY + random.uniform(0, JITTER))

        print("\n=== summary ===")
        print(f" query: {SEARCH or '(whole market)'}")
        print(f" stopped: {reason}")
        print(f" clean pages: {pages_ok} deepest offset: {deepest_offset} items: {items_total}")
        if floats_total:
            # Truncation would make MANY values short, not one exact binary fraction.
            verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION"
            print(f" floats: {floats_total} items, {dp_max}-decimal max, "
                  f"{low_prec} short-repr (exact fractions) — {verdict}")
        print(f" files in {OUT_DIR}")
    finally:
        browser.stop()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: drive main() to completion on the loop nodriver provides.
    event_loop = uc.loop()
    event_loop.run_until_complete(main())
|