Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
285
worker/poc.py
Normal file
285
worker/poc.py
Normal file
@@ -0,0 +1,285 @@
|
||||
"""
|
||||
Proof-of-concept / pre-fleet validation for the cs.money scraper.
|
||||
|
||||
Proves the things we need before building the C2 + worker fleet:
|
||||
1. nodriver clears cs.money's Cloudflare where .NET Selenium couldn't.
|
||||
2. a single WARM session can page the sell-orders API deeply without re-challenge.
|
||||
3. a free-text market search (e.g. "cyber security ft") can be turned into a
|
||||
filtered sell-orders API call — we DISCOVER the real API params by capturing the
|
||||
request the page itself fires, instead of guessing.
|
||||
|
||||
It opens the market (optionally a search URL) in a real non-headless Chromium, lets
|
||||
you clear Cloudflare, dismisses the cookie banner (privacy-preserving), captures the
|
||||
sell-orders request the page makes, then pages that API from inside the cleared page
|
||||
(same-origin fetch carries cf_clearance), pacing itself and stopping on re-challenge.
|
||||
|
||||
cd worker
|
||||
.venv\\Scripts\\Activate.ps1
|
||||
pip install -r requirements.txt
|
||||
|
||||
python poc.py # whole-market sweep
|
||||
$env:SEARCH="cyber security ft"; python poc.py # targeted: FT M4A4 Cyber Security
|
||||
|
||||
Env knobs (all optional):
|
||||
SEARCH free-text market search; when set, scrape only those results
|
||||
MARKET_URL market page base (default the buy market)
|
||||
SOLVE_SECONDS seconds to wait for you to clear Cloudflare (default 30)
|
||||
PAGES how many offset pages (60 each) to attempt (default 20)
|
||||
START_OFFSET first offset (default 0)
|
||||
DELAY / JITTER base + random seconds between fetches (default 2.0 / 1.5)
|
||||
PROXY host:port for an auth-free proxy (omit to use your own IP)
|
||||
BROWSER_PATH path to Chrome/Edge if auto-detect fails
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import random
|
||||
from urllib.parse import quote_plus, urlsplit, parse_qsl, urlencode, urlunsplit
|
||||
|
||||
import nodriver as uc
|
||||
from nodriver import cdp
|
||||
|
||||
# --- Runtime knobs: every one is an optional environment variable. ---
SEARCH = os.getenv("SEARCH")
MARKET_URL = os.getenv("MARKET_URL", "https://cs.money/market/buy/")
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))
PAGES = int(os.getenv("PAGES", "20"))
START_OFFSET = int(os.getenv("START_OFFSET", "0"))
DELAY = float(os.getenv("DELAY", "2.0"))
JITTER = float(os.getenv("JITTER", "1.5"))
PROXY = os.getenv("PROXY")
BROWSER_PATH = os.getenv("BROWSER_PATH")

# Pagination template used when the page's own sell-orders request could not
# be captured; the '{}' placeholder receives the offset.
DEFAULT_TEMPLATE = "https://cs.money/2.0/market/sell-orders?limit=60&offset={}"

# Every capture/dump file is written next to this script, under captures/.
OUT_DIR = pathlib.Path(__file__).parent / "captures"

# Button texts tried, in order, to dismiss the cookie banner. None of them
# accepts tracking.
CONSENT_LABELS = [
    "Reject all",
    "Reject All",
    "Only necessary",
    "Necessary only",
    "Reject",
    "Decline",
    "Deny",
]

# Sell-orders request URLs the page fired; appended to by the CDP network
# handler and by the Resource Timing scan in main().
_seen_urls: list[str] = []
|
||||
|
||||
|
||||
def looks_like_challenge(body: str) -> bool:
    """Return True when *body* looks like a challenge page rather than API JSON.

    A missing, empty or whitespace-only body counts as a challenge, as does
    anything that starts with markup ("<") or contains one of the
    challenge-page marker strings.
    """
    stripped = (body or "").lstrip()
    if not stripped:
        return True
    if stripped.startswith("<"):
        return True
    # Reached only with a non-empty string body, so the `in` tests are safe.
    return any(marker in body for marker in ("Just a moment", "challenge-platform"))
|
||||
|
||||
|
||||
def decimals(v: float) -> int:
    """Return how many digits follow the decimal point in *v*'s shortest repr.

    The value is converted with ``float()`` and its ``repr`` is parsed as a
    ``Decimal`` so that scientific notation is handled correctly: ``1e-05``
    has 5 decimals and ``1.234e-05`` has 8. Counting the characters after
    "." in the repr string gets both of those wrong.

    Args:
        v: A number (or anything ``float()`` accepts).

    Returns:
        The count of fractional digits; 0 for non-finite values and for
        values whose repr has a non-negative exponent (e.g. ``1e+16``).
    """
    from decimal import Decimal  # local import keeps this block self-contained

    parsed = Decimal(repr(float(v)))
    if not parsed.is_finite():
        # inf / nan have no digits after the point.
        return 0
    # A negative exponent is exactly the number of fractional digits.
    return max(0, -parsed.as_tuple().exponent)
|
||||
|
||||
|
||||
def template_from(url: str) -> str:
    """Build a pagination template from a captured sell-orders URL.

    Every query parameter except ``offset`` is kept (in order), ``limit=60``
    is appended when the URL carries no ``limit``, and ``offset={}`` is put
    last so callers can ``.format(offset)`` the result. Any URL fragment is
    dropped.
    """
    pieces = urlsplit(url)

    kept = []
    has_limit = False
    for key, value in parse_qsl(pieces.query, keep_blank_values=True):
        if key == "offset":
            continue  # replaced by the '{}' placeholder below
        if key == "limit":
            has_limit = True
        kept.append((key, value))
    if not has_limit:
        kept.append(("limit", "60"))

    encoded = urlencode(kept)
    query = f"{encoded}&offset={{}}" if encoded else "offset={}"
    return urlunsplit((pieces.scheme, pieces.netloc, pieces.path, query, ""))
|
||||
|
||||
|
||||
async def dismiss_consent(page) -> str | None:
    """Try to close the cookie banner without accepting tracking.

    Each text in CONSENT_LABELS is looked up on *page* in order; the first
    element that is found and clicked successfully wins. Lookup and click
    failures are ignored on purpose (best effort).

    Returns:
        The label that was clicked, or None when nothing could be clicked.
    """
    for label in CONSENT_LABELS:
        try:
            button = await page.find(label, best_match=True, timeout=2)
        except Exception:
            continue  # lookup failed or timed out; try the next label
        if not button:
            continue
        try:
            await button.click()
        except Exception:
            continue  # found but not clickable; try the next label
        return label
    return None
|
||||
|
||||
|
||||
async def fetch_json(page, url: str) -> tuple[str, str]:
    """Fetch *url* from inside the page and return ``(status, body)`` as strings.

    The request is made with the page's own ``fetch`` (credentials included),
    and the result is serialised in JavaScript with ``JSON.stringify`` so it
    crosses back to Python as a single string.

    Args:
        page: The browser tab; must provide ``evaluate(expr, await_promise=...)``.
        url: The URL to request.

    Returns:
        ``(status, body)``. Status is ``"-1"`` when the evaluation did not
        return a string (body ``""``) or returned something that is not the
        expected JSON object (body is then the raw returned string).
    """
    # json.dumps always produces a valid JavaScript string literal; Python's
    # repr() does not (its escapes and quoting rules differ from JS).
    js_url = json.dumps(url)
    expr = (
        f"fetch({js_url}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
        f".then(async r => JSON.stringify({{status: r.status, body: await r.text()}}))"
    )
    raw = await page.evaluate(expr, await_promise=True)
    if not isinstance(raw, str):
        return ("-1", "")
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        return ("-1", raw)
    if not isinstance(obj, dict):
        # Valid JSON but not the {status, body} envelope; treat as a failure
        # instead of raising AttributeError on .get().
        return ("-1", raw)
    return (str(obj.get("status", "-1")), obj.get("body", ""))
|
||||
|
||||
|
||||
async def main():
    """Run the proof of concept end to end and print a summary.

    Steps, in order:
      1. Create OUT_DIR and build the target URL (adds ``search=`` when SEARCH is set).
      2. Start a non-headless nodriver browser, optionally through PROXY.
      3. Attach a CDP request handler on a blank tab, open the target URL, wait
         SOLVE_SECONDS, then try to dismiss the consent banner.
      4. Scroll six times, read the Resource Timing entries, and dump the page HTML.
      5. Build the pagination template from the last captured sell-orders URL,
         or fall back to DEFAULT_TEMPLATE.
      6. Fetch up to PAGES pages (offset step 60), saving each page and stopping
         on a challenge, a non-JSON body, or an empty ``items`` list.
      7. Print the summary.

    The browser is stopped in ``finally`` whether or not the run succeeds.
    """
    OUT_DIR.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []

    target_url = MARKET_URL
    tag = "market"  # filename prefix for the saved pages
    if SEARCH:
        sep = "&" if "?" in MARKET_URL else "?"
        target_url = f"{MARKET_URL}{sep}search={quote_plus(SEARCH)}"
        # Filesystem-safe tag: non-alphanumerics become "_", capped at 40 chars.
        tag = "search_" + "".join(c if c.isalnum() else "_" for c in SEARCH)[:40]

    print(f"Launching nodriver Chromium (proxy={PROXY or 'none / own IP'})...")
    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)

    # Running totals for the summary. NOTE: dp_min is tracked but the summary
    # below only reports dp_max.
    pages_ok = items_total = floats_total = low_prec = 0
    dp_min, dp_max = 99, 0
    deepest_offset = None
    reason = "completed (hit PAGES limit)"  # overwritten when the loop breaks early

    try:
        # Open a blank tab first so the network handler is attached BEFORE the page
        # fires its filtered sell-orders request (otherwise we'd miss it).
        page = await browser.get("about:blank")

        async def on_request(evt):
            # Record every sell-orders request URL the page fires.
            url = evt.request.url
            if "/market/sell-orders" in url:
                _seen_urls.append(url)

        page.add_handler(cdp.network.RequestWillBeSent, on_request)
        try:
            await page.send(cdp.network.enable())
        except Exception as ex:
            # Not fatal: the Resource Timing scan below is a second discovery path.
            print(f"(network capture unavailable: {ex})")

        print(f"Opening {target_url}")
        await page.get(target_url)
        print(f"Solve any Cloudflare challenge. Waiting {SOLVE_SECONDS}s for the grid...")
        await page.sleep(SOLVE_SECONDS)

        clicked = await dismiss_consent(page)
        print(f"Consent banner: {'dismissed via ' + clicked if clicked else 'left up (does not block fetch)'}")

        # cs.money is an Astro SSR app — the initial filtered listings are rendered
        # server-side (no client XHR to capture). Scroll to provoke lazy-load
        # pagination, which DOES fire a client request carrying the real filter params.
        print("Scrolling to trigger lazy-load pagination...")
        for _ in range(6):
            try:
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            except Exception:
                pass  # best effort: a failed scroll just means one fewer trigger
            await page.sleep(2)

        # nodriver returns arrays unreliably from evaluate(), so JSON.stringify in JS
        # and json.loads here (the string path is proven by fetch_json).
        async def js_list(expr: str) -> list:
            raw = await page.evaluate(f"JSON.stringify({expr})")
            try:
                return json.loads(raw) if isinstance(raw, str) else []
            except (json.JSONDecodeError, TypeError):
                return []

        # Reliable discovery via the Resource Timing API: the browser records EVERY
        # request the page made, so we read the real sell-orders URL straight out of it
        # (no flaky CDP event timing). Also dump nearby API calls for context.
        try:
            all_urls = await js_list("performance.getEntriesByType('resource').map(e=>e.name)")
            print(f">>> Resource Timing saw {len(all_urls)} requests total")
            if all_urls:
                # dict.fromkeys de-duplicates while keeping first-seen order.
                (OUT_DIR / "_all_requests.txt").write_text(
                    "\n".join(dict.fromkeys(all_urls)), encoding="utf-8")
            sell = [u for u in all_urls if "/market/sell-orders" in u]
            _seen_urls.extend(sell)
            api = [u for u in all_urls if "cs.money/" in u and ("/2.0/" in u or "/1.0/" in u)]
            if api:
                (OUT_DIR / "_api_calls.txt").write_text("\n".join(dict.fromkeys(api)), encoding="utf-8")
                print(f">>> {len(set(api))} cs.money API calls; saved to {OUT_DIR / '_api_calls.txt'}")
        except Exception as ex:
            print(f"(resource-timing query failed: {ex})")

        # Dump the SSR'd page so we can see how the filter is encoded and where the
        # listings data lives (Astro embeds island props / hydration JSON in the HTML).
        try:
            html = await page.evaluate("document.documentElement.outerHTML")
            if isinstance(html, str) and html:
                (OUT_DIR / "_page.html").write_text(html, encoding="utf-8")
                print(f">>> saved page HTML ({len(html)} bytes) to {OUT_DIR / '_page.html'}")
        except Exception as ex:
            print(f"(page HTML dump failed: {ex})")

        # Discovery: what sell-orders request did the page actually make?
        if _seen_urls:
            # The most recently recorded URL becomes the pagination template.
            captured = _seen_urls[-1]
            template = template_from(captured)
            print("\n>>> DISCOVERED sell-orders API call the page fired:")
            print(f" {captured}")
            print(f">>> pagination template: {template}\n")
            # Persist it — the console line is easy to lose, and this is the one bit
            # of ground truth (the real filter-param scheme) we need.
            (OUT_DIR / "_discovered.txt").write_text(
                "ALL captured sell-orders requests:\n"
                + "\n".join(dict.fromkeys(_seen_urls))
                + f"\n\npagination template:\n{template}\n",
                encoding="utf-8")
            print(f">>> saved to {OUT_DIR / '_discovered.txt'}")
        else:
            # Nothing captured: guess that the API takes the same search= param.
            template = DEFAULT_TEMPLATE
            if SEARCH:
                template = template.replace("offset={}", f"search={quote_plus(SEARCH)}&offset={{}}")
            print(f"\n(no request captured; falling back to template: {template})\n")

        for i in range(PAGES):
            offset = START_OFFSET + i * 60
            status, body = await fetch_json(page, template.format(offset))

            if looks_like_challenge(body):
                print(f" page {i + 1} [offset {offset}]: RE-CHALLENGED (status {status}). Stopping.")
                # Keep the offending body for inspection.
                (OUT_DIR / f"{tag}_challenge_offset_{offset}.html").write_text(body, encoding="utf-8")
                reason = f"re-challenged at offset {offset}"
                break

            try:
                items = json.loads(body).get("items", [])
            except json.JSONDecodeError:
                print(f" page {i + 1} [offset {offset}]: non-JSON (status {status}). Stopping.")
                reason = f"non-JSON at offset {offset}"
                break

            if not items:
                print(f" page {i + 1} [offset {offset}]: 0 items — end of results.")
                reason = "end of results"
                break

            (OUT_DIR / f"{tag}_offset_{offset:06d}.json").write_text(body, encoding="utf-8")
            pages_ok += 1
            deepest_offset = offset
            items_total += len(items)
            names = set()  # distinct full item names seen on this page
            for it in items:
                fl = it.get("asset", {}).get("float")
                if fl is not None:
                    floats_total += 1
                    d = decimals(fl)
                    dp_min, dp_max = min(dp_min, d), max(dp_max, d)
                    if d <= 6:  # short repr — exact binary fraction (e.g. 1/16), not truncation
                        low_prec += 1
                names.add(it.get("asset", {}).get("names", {}).get("full"))
            # The name sample is only reported for targeted searches.
            sample = next(iter(names), None) if SEARCH else None
            print(f" page {i + 1} [offset {offset}] OK — {len(items)} items"
                  + (f" (e.g. {sample}; {len(names)} distinct names)" if SEARCH else ""))

            # Pace the requests: base delay plus random jitter.
            await page.sleep(DELAY + random.uniform(0, JITTER))

        print("\n=== summary ===")
        print(f" query: {SEARCH or '(whole market)'}")
        print(f" stopped: {reason}")
        print(f" clean pages: {pages_ok} deepest offset: {deepest_offset} items: {items_total}")
        if floats_total:
            # Truncation would make MANY values short, not one exact binary fraction.
            verdict = "FULL precision" if low_prec / floats_total < 0.02 else "POSSIBLE TRUNCATION"
            print(f" floats: {floats_total} items, {dp_max}-decimal max, "
                  f"{low_prec} short-repr (exact fractions) — {verdict}")
        print(f" files in {OUT_DIR}")
    finally:
        # Always shut the browser down, even when a step above raised.
        browser.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive main() to completion on the event loop
    # returned by nodriver's loop() helper.
    event_loop = uc.loop()
    event_loop.run_until_complete(main())
|
||||
Reference in New Issue
Block a user