Files
Operation-Blue-Laminate-v2/worker/discover_pagination.py
bob dc7c3f99ae Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration.

IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-05-31 15:03:53 -05:00

184 lines
7.8 KiB
Python

"""
Discover how cs.money paginates a filtered search past the initial ~60 SSR items.
Tests two hypotheses against a high-result search (default "ak-47 redline", which has
well over 60 listings):
A. Does the SSR page honor offset/limit in the URL? Fetch ?search=...&offset=60 and
?search=...&limit=120 and compare item ids to page 1. If disjoint/larger, we can
paginate cheaply by re-fetching the page.
B. The real client "load more": scroll down gradually in small mouse-wheel steps to
trigger lazy-load and record every new cs.money request seen via Resource Timing
(e.g. a /2.0/ XHR) — that request carries the structured filter params + offset,
i.e. a lighter direct-API pagination path.
Findings are printed and saved to captures/_pagination.txt.
cd worker; .venv\\Scripts\\Activate.ps1
python discover_pagination.py
$env:SEARCH="ak-47 redline"; python discover_pagination.py # override the search
"""
import json
import os
import pathlib
import re
import nodriver as uc
from nodriver import cdp
# --- Environment-driven settings -------------------------------------------
# Search phrase put into the ?search= query parameter.
SEARCH = os.getenv("SEARCH", "ak-47 redline")
# Seconds to wait on the first page load before interacting with the page.
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))
# Optional browser executable path handed to nodriver (None when unset).
BROWSER_PATH = os.getenv("BROWSER_PATH")
# Optional proxy; when set it is passed to the browser as --proxy-server=<PROXY>.
PROXY = os.getenv("PROXY")

# --- Fixed settings ---------------------------------------------------------
BASE = "https://cs.money/market/buy/"
# Captures the body of the <script id="__page-params"> tag; DOTALL so the
# payload may span several lines.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)
# Report directory, a sibling of this script.
OUT = pathlib.Path(__file__).parent / "captures"
# Consent-banner button labels (not referenced by the functions visible in this file).
CONSENT = ["Reject all", "Only necessary", "Reject", "Decline", "Deny"]

# Aggressive scroll: window + every scrollable container (the grid scrolls in a div,
# which is why a plain window.scrollTo didn't trigger lazy-load before).
SCROLL_JS = "".join((
    "window.scrollTo(0, document.body.scrollHeight);",
    "document.querySelectorAll('*').forEach(e=>{",
    " if (e.scrollHeight > e.clientHeight + 80) e.scrollTop = e.scrollHeight;});",
))
async def js(page, expr):
    """Evaluate the JS expression `expr` in `page` and decode the result.

    The expression is wrapped in ``JSON.stringify(...)`` so the value comes
    back as JSON text, which is then parsed into Python objects.

    Returns the decoded value, or ``None`` when ``page.evaluate`` yields
    anything other than a string or the string is not valid JSON.
    """
    payload = await page.evaluate(f"JSON.stringify({expr})")
    if not isinstance(payload, str):
        return None
    try:
        return json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return None
async def fetch_text(page, url):
    """Fetch `url` from inside the page (cookies included) and return its text.

    The request is issued with the page's own ``fetch`` and
    ``credentials:'include'``; status and body come back as one JSON string.

    Args:
        page: object exposing ``evaluate(expression, await_promise=...)``.
        url: URL to request.

    Returns:
        ``(status, body)`` on success, or ``(None, "")`` when the evaluate
        result is not a JSON object.
    """
    # json.dumps produces a valid JavaScript string literal for any Python str.
    # repr() does not: e.g. it emits \UXXXXXXXX escapes that JS does not
    # understand. This also matches how click_visible embeds its pattern.
    url_literal = json.dumps(url)
    expr = (f"fetch({url_literal},{{credentials:'include'}}).then(async r=>"
            "JSON.stringify({status:r.status, body:await r.text()}))")
    raw = await page.evaluate(expr, await_promise=True)
    try:
        reply = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None, ""
    # Valid JSON that is not an object (list, number, null) has no status/body.
    if not isinstance(reply, dict):
        return None, ""
    return reply.get("status"), reply.get("body", "")
def page_item_ids(html):
    """Return the item ids embedded in a page's ``__page-params`` script tag.

    The tag body is parsed as JSON and the ids are read from
    ``inventory.items[*].id``.

    Args:
        html: page source; ``None`` or an empty string is treated as no page.

    Returns:
        A list with the ``id`` of every item (``None`` for an item without
        one). The list is empty when the tag is missing, its body is not valid
        JSON, or the payload does not have the expected
        ``{"inventory": {"items": [...]}}`` shape.
    """
    found = PAGE_PARAMS_RE.search(html or "")
    if not found:
        return []
    try:
        params = json.loads(found.group(1))
    except json.JSONDecodeError:
        return []
    # json.loads can return any JSON type and any level may be null, so check
    # each level instead of letting .get() raise AttributeError mid-run.
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    if not isinstance(items, list):
        return []
    return [item.get("id") for item in items if isinstance(item, dict)]
async def click_visible(page, pattern):
    """Click the first VISIBLE element whose trimmed text matches `pattern`.

    `pattern` is used as a case-insensitive JavaScript regular expression.
    Candidates are button, a, [role="button"], span and div elements that have
    a non-null offsetParent and fewer than 40 characters of trimmed text.
    nodriver's find() was matching hidden/duplicate nodes; these restrictions
    hit the real button.

    Returns True when the evaluate result is a string containing "true"
    (an element was found and clicked), otherwise False.
    """
    regex_literal = json.dumps(pattern)
    script = "".join((
        "JSON.stringify((()=>{",
        "const re=new RegExp(", regex_literal, ",'i');",
        "const els=[...document.querySelectorAll('button,a,[role=\"button\"],span,div')];",
        "const b=els.find(e=>e.offsetParent!==null && (e.textContent||'').trim().length<40 ",
        "&& re.test((e.textContent||'').trim()));",
        "if(b){b.click();return true}return false})())",
    ))
    outcome = await page.evaluate(script)
    if not isinstance(outcome, str):
        return False
    return "true" in outcome
async def banner_present(page):
    """Report whether the page text still shows the cookie banner.

    The banner counts as present when document.body.innerText matches
    "Manage cookies" or "Accept all" (case-insensitive).

    Returns True when the evaluate result is a string containing "true",
    otherwise False.
    """
    probe = "JSON.stringify(/Manage cookies|Accept all/i.test(document.body.innerText||''))"
    verdict = await page.evaluate(probe)
    if not isinstance(verdict, str):
        return False
    return "true" in verdict
async def dismiss(page):
    """Get the cookie banner out of the way and describe what was clicked.

    Privacy-preserving path first (Manage -> Reject all -> Confirm). If the
    banner is still up afterwards, fall back to Accept all so the page becomes
    interactive (discovery needs scrolling to work).

    Returns a comma-separated trail of the steps taken, ending in "gone" or
    "STILL-PRESENT" according to a final banner check.
    """
    trail = []

    if await click_visible(page, "manage cookies|^manage$"):
        trail.append("manage")
        await page.sleep(1.2)

    if await click_visible(page, "reject all"):
        trail.append("reject-all")
        await page.sleep(0.4)

    # Try each confirm-button wording in turn; stop at the first that clicks.
    for confirm_pattern in ("confirm my choice", "^confirm$", "^save$"):
        clicked = await click_visible(page, confirm_pattern)
        if clicked:
            trail.append("confirm")
            break

    await page.sleep(1)

    if await banner_present(page):
        if await click_visible(page, "accept all|^accept$"):
            trail.append("still-up->accept")
        else:
            trail.append("still-up")
        await page.sleep(0.5)

    still_there = await banner_present(page)
    trail.append("STILL-PRESENT" if still_there else "gone")
    return ", ".join(trail)
# Resource Timing query shared by every "which requests have happened" check.
_RESOURCE_NAMES_JS = "performance.getEntriesByType('resource').map(e=>e.name)"


async def _new_cs_money_requests(page, before):
    """List resource URLs not in `before` that contain "cs.money" but neither
    "metrics." nor "traces."."""
    names = await js(page, _RESOURCE_NAMES_JS) or []
    return [u for u in names
            if isinstance(u, str) and u not in before and "cs.money" in u
            and "metrics." not in u and "traces." not in u]


async def _card_count(page):
    """Count elements matching the item-card selector; None if it can't be read."""
    return await js(
        page,
        "document.querySelectorAll('[href*=\"/item/\"],[class*=\"item\" i]').length")


async def main():
    """Run both pagination experiments and write the findings report.

    A. Re-fetch the search page with ``&offset=60`` and ``&limit=120`` and
       compare the embedded item ids with those of the plain search page.
    B. Scroll the page with 60 small mouse-wheel steps and list the cs.money
       resource URLs that appeared since before the scroll.

    Findings are printed and written to ``captures/_pagination.txt`` next to
    this script. The browser is stopped even if a step raises.
    """
    # Function-scope import, as in the original, so the file's import block is untouched.
    from urllib.parse import quote_plus

    OUT.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []
    args.append("--blink-settings=imagesEnabled=false")
    q = quote_plus(SEARCH)
    findings = []
    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)
    try:
        url0 = f"{BASE}?search={q}"
        page = await browser.get(url0)
        print(f"Warming on {url0} ({SOLVE_SECONDS}s for Cloudflare)...")
        await page.sleep(SOLVE_SECONDS)
        print(f"Consent: {await dismiss(page)}")

        # --- A. URL offset/limit on the SSR page ---
        s0, h0 = await fetch_text(page, f"{BASE}?search={q}")
        s1, h1 = await fetch_text(page, f"{BASE}?search={q}&offset=60")
        s2, h2 = await fetch_text(page, f"{BASE}?search={q}&limit=120")
        a, b, c = page_item_ids(h0), page_item_ids(h1), page_item_ids(h2)
        overlap = len(set(a) & set(b))
        findings.append(f"page1 ids={len(a)} offset=60 ids={len(b)} (overlap with page1={overlap}) limit=120 ids={len(c)}")
        # Keep the HTTP statuses: a blocked or failed fetch would otherwise be
        # indistinguishable from "parameter ignored".
        findings.append(f" -> http status: page1={s0} offset={s1} limit={s2}")
        # An empty page-1 id list makes the overlap trivially 0, so "disjoint"
        # would prove nothing; report that case as unknown instead of YES.
        if not a:
            offset_verdict = "unknown (page1 yielded no ids to compare against)"
        elif b and overlap == 0:
            offset_verdict = "YES (disjoint)"
        else:
            offset_verdict = "no/ignored"
        findings.append(f" -> offset works? {offset_verdict}")
        findings.append(f" -> limit works? {'YES (>60)' if len(c) > 60 else 'no/ignored'}")

        # --- B. Trigger client load-more, capture cs.money /2.0/ XHRs ---
        # Infinite scroll only fires on GRADUAL downward scrolling — jumping to the
        # bottom skips the trigger. So step down in small wheel increments and watch
        # the item count grow.
        before = set(await js(page, _RESOURCE_NAMES_JS) or [])
        print(f" cards before scroll: {await _card_count(page)}")
        wheel_failures = 0
        for step in range(60):
            try:
                await page.send(cdp.input_.dispatch_mouse_event(
                    type_="mouseWheel", x=720, y=450, delta_x=0, delta_y=500))
            except Exception:
                # Best effort: one failed wheel event must not end the run,
                # but the count is reported so a dead scroll is visible.
                wheel_failures += 1
            await page.sleep(0.7)
            if step % 15 == 14:
                now = await _new_cs_money_requests(page, before)
                print(f" step {step+1}: cards={await _card_count(page)} new cs.money reqs={len(now)}")
        new_xhrs = await _new_cs_money_requests(page, before)
        findings.append(f"\nclient requests after scrolling ({len(new_xhrs)} new cs.money):")
        # dict.fromkeys de-duplicates while keeping first-seen order.
        findings.extend(f" {u}" for u in dict.fromkeys(new_xhrs))
        if not new_xhrs:
            findings.append(" (none — grid may not lazy-load via XHR, or scroll didn't reach the trigger)")
        if wheel_failures:
            findings.append(f" ({wheel_failures} of 60 wheel events failed to dispatch)")

        report = "\n".join(findings)
        print("\n=== FINDINGS ===\n" + report)
        (OUT / "_pagination.txt").write_text(f"search: {SEARCH}\n\n{report}\n", encoding="utf-8")
        print(f"\nsaved to {OUT / '_pagination.txt'}")
    finally:
        browser.stop()
# Script entry point: drive main() to completion on nodriver's event loop.
if __name__ == "__main__":
    event_loop = uc.loop()
    event_loop.run_until_complete(main())