Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
184 lines
7.8 KiB
Python
184 lines
7.8 KiB
Python
"""
|
|
Discover how cs.money paginates a filtered search past the initial ~60 SSR items.
|
|
|
|
Tests two hypotheses against a high-result search (default "ak-47 redline", which has
|
|
well over 60 listings):
|
|
|
|
A. Does the SSR page honor offset/limit in the URL? Fetch ?search=...&offset=60 and
|
|
?search=...&limit=120 and compare item ids to page 1. If disjoint/larger, we can
|
|
paginate cheaply by re-fetching the page.
|
|
B. The real client "load more": scroll hard to trigger lazy-load and capture any
|
|
cs.money /2.0/ XHR via Resource Timing — that request carries the structured
|
|
filter params + offset, i.e. a lighter direct-API pagination path.
|
|
|
|
Findings are printed and saved to captures/_pagination.txt.
|
|
|
|
cd worker; .venv\\Scripts\\Activate.ps1
|
|
python discover_pagination.py
|
|
$env:SEARCH="ak-47 redline"; python discover_pagination.py # override the search
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import pathlib
|
|
import re
|
|
|
|
import nodriver as uc
|
|
from nodriver import cdp
|
|
|
|
# --- Environment-driven settings -------------------------------------------------
# Search phrase to probe; override with the SEARCH environment variable.
SEARCH = os.getenv("SEARCH", "ak-47 redline")
# Seconds main() sleeps on the first page load before touching the page.
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))
# Optional browser executable handed to nodriver (None when unset).
BROWSER_PATH = os.getenv("BROWSER_PATH")
# Optional value for Chromium's --proxy-server flag (None when unset).
PROXY = os.getenv("PROXY")

# --- Fixed values ----------------------------------------------------------------
BASE = "https://cs.money/market/buy/"
# Captures the body of the <script id="__page-params"> tag; DOTALL lets the
# captured payload span multiple lines.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)
# Output directory that sits next to this script.
OUT = pathlib.Path(__file__).parent.joinpath("captures")
# Consent-button labels (not referenced by the other code visible in this file).
CONSENT = ["Reject all", "Only necessary", "Reject", "Decline", "Deny"]

# Aggressive scroll: window + every scrollable container (the grid scrolls in a div,
# which is why a plain window.scrollTo didn't trigger lazy-load before).
SCROLL_JS = "".join([
    "window.scrollTo(0, document.body.scrollHeight);",
    "document.querySelectorAll('*').forEach(e=>{",
    " if (e.scrollHeight > e.clientHeight + 80) e.scrollTop = e.scrollHeight;});",
])
|
|
|
|
|
|
async def js(page, expr):
    """Evaluate the JavaScript expression `expr` in `page` and decode the result.

    The expression is wrapped in ``JSON.stringify(...)`` so the value crosses the
    browser boundary as a JSON string, which is then parsed back into Python.

    Returns the decoded value, or None when the evaluation did not yield a string
    or the string is not valid JSON.
    """
    payload = await page.evaluate(f"JSON.stringify({expr})")
    # Anything other than a string means there is no JSON to decode.
    if not isinstance(payload, str):
        return None
    try:
        return json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return None
|
|
|
|
|
|
async def fetch_text(page, url):
    """Fetch `url` from inside the page (cookies included) and return its text.

    The request is issued with the browser's own ``fetch`` so it carries the
    page's session cookies (``credentials:'include'``).

    Args:
        page: object exposing ``evaluate(expr, await_promise=True)``.
        url: absolute or page-relative URL to fetch.

    Returns:
        ``(status, body)`` on success, or ``(None, "")`` when the page did not
        hand back a JSON object (evaluation failed, non-string result, bad JSON).
    """
    # json.dumps produces a string literal that is valid JavaScript for any input
    # (quotes, backslashes, non-ASCII); Python's repr() is Python syntax and is
    # not guaranteed to be. This also matches how click_visible embeds its pattern.
    url_literal = json.dumps(url)
    expr = (f"fetch({url_literal},{{credentials:'include'}}).then(async r=>"
            "JSON.stringify({status:r.status, body:await r.text()}))")
    raw = await page.evaluate(expr, await_promise=True)
    try:
        o = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None, ""
    # Valid JSON that is not an object (e.g. "null") carries no status/body.
    if not isinstance(o, dict):
        return None, ""
    return o.get("status"), o.get("body", "")
|
|
|
|
|
|
def page_item_ids(html):
    """Return the item ids embedded in a page's ``__page-params`` script tag.

    Looks up the tag with PAGE_PARAMS_RE, parses its body as JSON and reads
    ``inventory.items[*].id``.

    Args:
        html: page source; None or an empty string is treated as "no page".

    Returns:
        A list with one entry per item object (None for an item without an
        ``id`` key). An empty list when the tag is missing, its body is not
        valid JSON, or the JSON does not have the expected
        ``{"inventory": {"items": [...]}}`` shape.
    """
    m = PAGE_PARAMS_RE.search(html or "")
    if not m:
        return []
    try:
        params = json.loads(m.group(1))
    except json.JSONDecodeError:
        return []
    # Valid JSON can still have an unexpected shape (null inventory, top-level
    # list, ...). Treat that as "no items" instead of raising, so one odd
    # response does not abort the whole discovery run.
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    if not isinstance(items, list):
        return []
    return [it.get("id") for it in items if isinstance(it, dict)]
|
|
|
|
|
|
async def click_visible(page, pattern):
    """Click the first visible element whose trimmed text matches `pattern`.

    The match is a case-insensitive JavaScript RegExp test, run in the page over
    button, a, [role="button"], span and div elements. An element only qualifies
    when its offsetParent is non-null and its trimmed text is shorter than 40
    characters (nodriver's find() was matching hidden/duplicate nodes; these two
    restrictions hit the real button).

    Returns True when the page reported that an element was clicked, else False.
    """
    # json.dumps turns the pattern into a quoted JS string literal.
    pattern_literal = json.dumps(pattern)
    script = "".join((
        "JSON.stringify((()=>{",
        "const re=new RegExp(", pattern_literal, ",'i');",
        "const els=[...document.querySelectorAll('button,a,[role=\"button\"],span,div')];",
        "const b=els.find(e=>e.offsetParent!==null && (e.textContent||'').trim().length<40 ",
        "&& re.test((e.textContent||'').trim()));",
        "if(b){b.click();return true}return false})())",
    ))
    outcome = await page.evaluate(script)
    if not isinstance(outcome, str):
        return False
    return "true" in outcome
|
|
|
|
|
|
async def banner_present(page):
    """Report whether the page's visible text still shows the consent banner.

    Runs a case-insensitive test for "Manage cookies" or "Accept all" against
    ``document.body.innerText`` and returns True when the page answers "true".
    Any non-string evaluation result counts as False.
    """
    probe = "JSON.stringify(/Manage cookies|Accept all/i.test(document.body.innerText||''))"
    verdict = await page.evaluate(probe)
    if isinstance(verdict, str):
        return "true" in verdict
    return False
|
|
|
|
|
|
async def dismiss(page):
    """Try to get rid of the cookie-consent banner and describe what happened.

    Privacy-preserving path first (Manage -> Reject all -> Confirm); if the
    banner is still up afterwards, fall back to Accept all so the page becomes
    interactive (discovery needs scrolling to work).

    Returns a comma-separated trail of the steps that fired, always ending in
    "gone" or "STILL-PRESENT" according to a final banner check.
    """
    trail = []

    # Each click is followed by a short pause before the next lookup.
    opened = await click_visible(page, "manage cookies|^manage$")
    if opened:
        trail.append("manage")
        await page.sleep(1.2)

    rejected = await click_visible(page, "reject all")
    if rejected:
        trail.append("reject-all")
        await page.sleep(0.4)

    # Try the confirm-button wordings in order; stop at the first that clicks.
    for label in ("confirm my choice", "^confirm$", "^save$"):
        if await click_visible(page, label):
            trail.append("confirm")
            break
    await page.sleep(1)

    # Fallback: accept everything if the banner text is still on the page.
    if await banner_present(page):
        accepted = await click_visible(page, "accept all|^accept$")
        trail.append("still-up->accept" if accepted else "still-up")
        await page.sleep(0.5)

    still_there = await banner_present(page)
    trail.append("STILL-PRESENT" if still_there else "gone")
    return ", ".join(trail)
|
|
|
|
|
|
async def main():
    """Run both pagination probes against a live cs.money search and save a report.

    Starts a headed browser (through PROXY when set, images disabled), opens the
    search page, sleeps SOLVE_SECONDS, dismisses the consent banner, then:

      A. re-fetches the page with ``offset=60`` and ``limit=120`` and compares the
         embedded item ids with page 1;
      B. scrolls with small mouse-wheel steps and lists the cs.money requests
         that show up in Resource Timing afterwards.

    The findings are printed and written to ``captures/_pagination.txt``. The
    browser is stopped even when a probe raises.
    """
    from urllib.parse import quote_plus  # only this entry point needs it

    resources_js = "performance.getEntriesByType('resource').map(e=>e.name)"

    def is_site_request(u):
        """True for cs.money URLs that do not contain 'metrics.' or 'traces.'."""
        return "cs.money" in u and "metrics." not in u and "traces." not in u

    OUT.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []
    args.append("--blink-settings=imagesEnabled=false")
    q = quote_plus(SEARCH)
    findings = []

    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)
    try:
        url0 = f"{BASE}?search={q}"
        page = await browser.get(url0)
        print(f"Warming on {url0} ({SOLVE_SECONDS}s for Cloudflare)...")
        await page.sleep(SOLVE_SECONDS)
        print(f"Consent: {await dismiss(page)}")

        # --- A. URL offset/limit on the SSR page ---
        _, h0 = await fetch_text(page, f"{BASE}?search={q}")
        _, h1 = await fetch_text(page, f"{BASE}?search={q}&offset=60")
        _, h2 = await fetch_text(page, f"{BASE}?search={q}&limit=120")
        a, b, c = page_item_ids(h0), page_item_ids(h1), page_item_ids(h2)
        overlap = len(set(a) & set(b))
        findings.append(f"page1 ids={len(a)} offset=60 ids={len(b)} (overlap with page1={overlap}) limit=120 ids={len(c)}")
        # With no page-1 ids the overlap is trivially 0, so "disjoint" would be
        # a false positive; report that case as inconclusive instead.
        if not a:
            offset_verdict = "inconclusive (page 1 returned no ids)"
        elif b and overlap == 0:
            offset_verdict = "YES (disjoint)"
        else:
            offset_verdict = "no/ignored"
        findings.append(f" -> offset works? {offset_verdict}")
        findings.append(f" -> limit works? {'YES (>60)' if len(c) > 60 else 'no/ignored'}")

        # --- B. Trigger client load-more, capture cs.money /2.0/ XHRs ---
        # Infinite scroll only fires on GRADUAL downward scrolling — jumping to the
        # bottom skips the trigger. So step down in small wheel increments and watch
        # the item count grow.
        #
        # Browsers cap the Resource Timing buffer (250 entries by default) and drop
        # entries once it is full, which would hide the very requests this probe
        # is looking for. Raise the cap before taking the baseline snapshot.
        await page.evaluate("performance.setResourceTimingBufferSize(10000)")
        before = set(await js(page, resources_js) or [])

        async def card_count():
            """Return the page's count of item-like nodes (as the JSON string the page sent)."""
            return await page.evaluate(
                "JSON.stringify(document.querySelectorAll('[href*=\"/item/\"],[class*=\"item\" i]').length)")

        print(f" cards before scroll: {await card_count()}")
        wheel_failures = 0
        for step in range(60):
            try:
                await page.send(cdp.input_.dispatch_mouse_event(
                    type_="mouseWheel", x=720, y=450, delta_x=0, delta_y=500))
            except Exception as exc:
                # Scrolling is best-effort, so keep going — but count failures:
                # if wheel events never land, an empty result below means
                # "never scrolled", not "no lazy-load".
                wheel_failures += 1
                if wheel_failures == 1:
                    print(f" wheel event failed (continuing): {exc!r}")
            await page.sleep(0.7)
            if step % 15 == 14:
                now = [u for u in (await js(page, resources_js) or [])
                       if u not in before and is_site_request(u)]
                print(f" step {step+1}: cards={await card_count()} new cs.money reqs={len(now)}")

        after = await js(page, resources_js) or []
        new_xhrs = [u for u in after if u not in before and is_site_request(u)]
        findings.append(f"\nclient requests after scrolling ({len(new_xhrs)} new cs.money):")
        # dict.fromkeys de-duplicates while keeping first-seen order.
        findings.extend(f" {u}" for u in dict.fromkeys(new_xhrs))
        if not new_xhrs:
            findings.append(" (none — grid may not lazy-load via XHR, or scroll didn't reach the trigger)")
        if wheel_failures:
            findings.append(f" WARNING: {wheel_failures}/60 wheel events failed; scroll results may be unreliable")

        report = "\n".join(findings)
        print("\n=== FINDINGS ===\n" + report)
        (OUT / "_pagination.txt").write_text(f"search: {SEARCH}\n\n{report}\n", encoding="utf-8")
        print(f"\nsaved to {OUT / '_pagination.txt'}")
    finally:
        browser.stop()
|
|
|
|
|
|
# Script entry point: drive main() to completion on nodriver's event loop.
if __name__ == "__main__":
    event_loop = uc.loop()
    event_loop.run_until_complete(main())
|