Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
183
worker/discover_pagination.py
Normal file
183
worker/discover_pagination.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Discover how cs.money paginates a filtered search past the initial ~60 SSR items.
|
||||
|
||||
Tests two hypotheses against a high-result search (default "ak-47 redline", which has
|
||||
well over 60 listings):
|
||||
|
||||
A. Does the SSR page honor offset/limit in the URL? Fetch ?search=...&offset=60 and
|
||||
?search=...&limit=120 and compare item ids to page 1. If disjoint/larger, we can
|
||||
paginate cheaply by re-fetching the page.
|
||||
B. The real client "load more": scroll down gradually to trigger lazy-load and capture any
|
||||
cs.money /2.0/ XHR via Resource Timing — that request carries the structured
|
||||
filter params + offset, i.e. a lighter direct-API pagination path.
|
||||
|
||||
Findings are printed and saved to captures/_pagination.txt.
|
||||
|
||||
cd worker; .venv\\Scripts\\Activate.ps1
|
||||
python discover_pagination.py
|
||||
$env:SEARCH="ak-47 redline"; python discover_pagination.py # override the search
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
import nodriver as uc
|
||||
from nodriver import cdp
|
||||
|
||||
# Search phrase used for every request; override with the SEARCH env var.
SEARCH = os.environ.get("SEARCH", "ak-47 redline")
# Seconds main() sleeps after the first navigation (the Cloudflare solve window).
SOLVE_SECONDS = int(os.environ.get("SOLVE_SECONDS", "30"))
# Optional browser executable path handed to nodriver; None when unset.
BROWSER_PATH = os.environ.get("BROWSER_PATH")
# Optional proxy; when set, main() passes it to the browser as --proxy-server=<value>.
PROXY = os.environ.get("PROXY")

# Market listing page; main() appends ?search=... (plus offset/limit probes) to it.
BASE = "https://cs.money/market/buy/"
# Captures the text inside the <script id="__page-params"> element; re.S lets
# the lazy group span newlines. page_item_ids() parses that text as JSON.
PAGE_PARAMS_RE = re.compile(r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>', re.S)
# Output directory ("captures") next to this script; created by main().
OUT = pathlib.Path(__file__).parent / "captures"
# Consent-button labels.
# NOTE(review): not referenced by any code visible in this file — confirm before removing.
CONSENT = ["Reject all", "Only necessary", "Reject", "Decline", "Deny"]

# Aggressive scroll: window + every scrollable container (the grid scrolls in a div,
# which is why a plain window.scrollTo didn't trigger lazy-load before).
# NOTE(review): main() as shown scrolls with CDP mouse-wheel events and does not
# evaluate this snippet — confirm whether it is still needed.
SCROLL_JS = (
    "window.scrollTo(0, document.body.scrollHeight);"
    "document.querySelectorAll('*').forEach(e=>{"
    " if (e.scrollHeight > e.clientHeight + 80) e.scrollTop = e.scrollHeight;});")
|
||||
|
||||
|
||||
async def js(page, expr):
    """Evaluate a JavaScript expression in the page and return it as Python data.

    The expression is wrapped in ``JSON.stringify(...)`` so the browser hands
    back a JSON string, which is decoded here.

    Args:
        page: Object with an awaitable ``evaluate(source)`` method.
        expr: JavaScript expression source, spliced verbatim into the call.

    Returns:
        The decoded value, or ``None`` when the evaluation did not yield a
        string or the string is not valid JSON.
    """
    payload = await page.evaluate(f"JSON.stringify({expr})")
    # Anything other than a string means the evaluation did not produce JSON.
    if not isinstance(payload, str):
        return None
    try:
        decoded = json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return None
    return decoded
|
||||
|
||||
|
||||
async def fetch_text(page, url):
    """Fetch ``url`` from inside the page and return its status and body text.

    The request is issued by the page's own ``fetch`` with
    ``credentials:'include'``, and the result is marshalled back as a JSON
    string of the form ``{"status": ..., "body": ...}``.

    Args:
        page: Object with an awaitable ``evaluate(source, await_promise=...)``.
        url: URL to request.

    Returns:
        ``(status, body)``. ``(None, "")`` when the evaluation result is not a
        JSON object (for example the promise rejected or nothing came back).
    """
    # json.dumps produces a valid JavaScript string literal for any input;
    # Python's repr() does not (it can emit escapes such as \U that JavaScript
    # does not understand). This also matches how click_visible embeds text.
    expr = (f"fetch({json.dumps(url)},{{credentials:'include'}}).then(async r=>"
            f"JSON.stringify({{status:r.status, body:await r.text()}}))")
    raw = await page.evaluate(expr, await_promise=True)
    try:
        o = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None, ""
    # Valid JSON that is not an object has no status/body to report.
    if not isinstance(o, dict):
        return None, ""
    return o.get("status"), o.get("body", "")
|
||||
|
||||
|
||||
def page_item_ids(html):
    """Return the item ids embedded in a page's ``__page-params`` script.

    The script body is parsed as JSON and the ids are read from
    ``inventory.items[*].id``.

    Args:
        html: Page HTML, or a falsy value (treated as empty).

    Returns:
        A list with one entry per item object (``None`` for an item without an
        ``id``). Empty when the script tag is missing, its content is not valid
        JSON, or the JSON does not have the expected object/list shape.
    """
    m = PAGE_PARAMS_RE.search(html or "")
    if not m:
        return []
    try:
        params = json.loads(m.group(1))
    except json.JSONDecodeError:
        return []
    # Guard every level: a payload such as {"inventory": null} or a top-level
    # list would otherwise raise AttributeError instead of meaning "no ids".
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    if not isinstance(items, list):
        return []
    return [it.get("id") for it in items if isinstance(it, dict)]
|
||||
|
||||
|
||||
async def click_visible(page, pattern):
    """Click the first visible element whose trimmed text matches ``pattern``.

    The match is a case-insensitive JavaScript regular expression. Candidates
    are buttons, links, ``role="button"`` nodes, spans and divs that have a
    non-null ``offsetParent`` and fewer than 40 characters of trimmed text,
    which avoids the hidden/duplicate nodes nodriver's find() was matching.

    Args:
        page: Object with an awaitable ``evaluate(source)`` method.
        pattern: Regular-expression source, embedded as a JSON string literal.

    Returns:
        ``True`` when the evaluation result is a string containing ``"true"``
        (an element was found and clicked), otherwise ``False``.
    """
    script = "".join((
        "JSON.stringify((()=>{",
        "const re=new RegExp(", json.dumps(pattern), ",'i');",
        "const els=[...document.querySelectorAll('button,a,[role=\"button\"],span,div')];",
        "const b=els.find(e=>e.offsetParent!==null && (e.textContent||'').trim().length<40 ",
        "&& re.test((e.textContent||'').trim()));",
        "if(b){b.click();return true}return false})())",
    ))
    outcome = await page.evaluate(script)
    if not isinstance(outcome, str):
        return False
    return "true" in outcome
|
||||
|
||||
|
||||
async def banner_present(page):
    """Report whether the page text still shows the cookie banner.

    The page's ``document.body.innerText`` is tested, case-insensitively, for
    "Manage cookies" or "Accept all".

    Args:
        page: Object with an awaitable ``evaluate(source)`` method.

    Returns:
        ``True`` when the evaluation result is a string containing ``"true"``,
        otherwise ``False``.
    """
    probe = "JSON.stringify(/Manage cookies|Accept all/i.test(document.body.innerText||''))"
    verdict = await page.evaluate(probe)
    if not isinstance(verdict, str):
        return False
    return "true" in verdict
|
||||
|
||||
|
||||
async def dismiss(page):
    """Get the cookie banner out of the way and describe what was clicked.

    The privacy-preserving route is tried first (Manage -> Reject all ->
    Confirm). If the banner is still showing afterwards, "Accept all" is
    clicked as a fallback so the page becomes interactive, because discovery
    needs scrolling to work.

    Args:
        page: Tab-like object passed on to click_visible/banner_present; it
            must also provide an awaitable ``sleep(seconds)``.

    Returns:
        A comma-separated trail of the steps taken, always ending in ``gone``
        or ``STILL-PRESENT``.
    """
    trail = []

    if await click_visible(page, "manage cookies|^manage$"):
        trail.append("manage")
        await page.sleep(1.2)

    if await click_visible(page, "reject all"):
        trail.append("reject-all")
        await page.sleep(0.4)

    # Only the first confirm-style button that matches gets clicked.
    for confirm_pattern in ("confirm my choice", "^confirm$", "^save$"):
        clicked = await click_visible(page, confirm_pattern)
        if clicked:
            trail.append("confirm")
            break

    await page.sleep(1)

    if await banner_present(page):
        accepted = await click_visible(page, "accept all|^accept$")
        if accepted:
            trail.append("still-up->accept")
        else:
            trail.append("still-up")
        await page.sleep(0.5)

    if await banner_present(page):
        trail.append("STILL-PRESENT")
    else:
        trail.append("gone")
    return ", ".join(trail)
|
||||
|
||||
|
||||
def _cs_money_requests(urls, seen):
    """Return the cs.money URLs in ``urls`` that are not in ``seen``.

    URLs containing ``metrics.`` or ``traces.`` are skipped. First-seen order
    is kept and duplicates are collapsed, so the length of the result matches
    what gets listed in the report.
    """
    fresh = (u for u in urls
             if u not in seen and "cs.money" in u
             and "metrics." not in u and "traces." not in u)
    return list(dict.fromkeys(fresh))


async def main():
    """Run both pagination probes against cs.money and save the findings.

    A. Re-fetch the SSR page with ``offset=60`` and ``limit=120`` and compare
       the embedded item ids with page 1.
    B. Scroll the grid with small mouse-wheel steps and list the cs.money
       requests that appear in Resource Timing afterwards.

    The findings are printed and written to ``captures/_pagination.txt``. The
    browser is stopped even if a probe raises.
    """
    from urllib.parse import quote_plus

    OUT.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []
    args.append("--blink-settings=imagesEnabled=false")
    q = quote_plus(SEARCH)
    findings = []
    resource_names = "performance.getEntriesByType('resource').map(e=>e.name)"
    scroll_steps = 60

    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)
    try:
        url0 = f"{BASE}?search={q}"
        page = await browser.get(url0)
        print(f"Warming on {url0} ({SOLVE_SECONDS}s for Cloudflare)...")
        await page.sleep(SOLVE_SECONDS)
        print(f"Consent: {await dismiss(page)}")

        # --- A. URL offset/limit on the SSR page ---
        s0, h0 = await fetch_text(page, f"{BASE}?search={q}")
        s1, h1 = await fetch_text(page, f"{BASE}?search={q}&offset=60")
        s2, h2 = await fetch_text(page, f"{BASE}?search={q}&limit=120")
        a, b, c = page_item_ids(h0), page_item_ids(h1), page_item_ids(h2)
        overlap = len(set(a) & set(b))
        # Record the statuses so a blocked or failed fetch is not mistaken for
        # "the parameter was ignored".
        findings.append(f"http status: page1={s0}  offset=60={s1}  limit=120={s2}")
        findings.append(f"page1 ids={len(a)}  offset=60 ids={len(b)} (overlap with page1={overlap})  limit=120 ids={len(c)}")
        # "Disjoint" only means something when page 1 actually returned ids.
        findings.append(f"  -> offset works? {'YES (disjoint)' if a and b and overlap == 0 else 'no/ignored'}")
        findings.append(f"  -> limit works?  {'YES (>60)' if len(c) > 60 else 'no/ignored'}")

        # --- B. Trigger client load-more, capture cs.money /2.0/ XHRs ---
        # Infinite scroll only fires on GRADUAL downward scrolling — jumping to the
        # bottom skips the trigger. So step down in small wheel increments and watch
        # the item count grow.
        before = set(await js(page, resource_names) or [])

        async def card_count():
            """Return the JSON-encoded count of item-like nodes on the page."""
            n = await page.evaluate(
                "JSON.stringify(document.querySelectorAll('[href*=\"/item/\"],[class*=\"item\" i]').length)")
            return n

        print(f"  cards before scroll: {await card_count()}")
        wheel_failures = 0
        last_wheel_error = None
        for step in range(scroll_steps):
            try:
                await page.send(cdp.input_.dispatch_mouse_event(
                    type_="mouseWheel", x=720, y=450, delta_x=0, delta_y=500))
            except Exception as exc:
                # Best-effort: keep scrolling, but remember the failure so the
                # report can say the scroll itself may not have happened.
                wheel_failures += 1
                last_wheel_error = exc
            await page.sleep(0.7)
            if step % 15 == 14:
                now = _cs_money_requests(await js(page, resource_names) or [], before)
                print(f"  step {step+1}: cards={await card_count()}  new cs.money reqs={len(now)}")

        after = await js(page, resource_names) or []
        new_xhrs = _cs_money_requests(after, before)
        findings.append(f"\nclient requests after scrolling ({len(new_xhrs)} new cs.money):")
        findings.extend(f"  {u}" for u in new_xhrs)
        if not new_xhrs:
            findings.append("  (none — grid may not lazy-load via XHR, or scroll didn't reach the trigger)")
        if wheel_failures:
            findings.append(f"  ({wheel_failures}/{scroll_steps} wheel events failed; last error: {last_wheel_error!r})")

        report = "\n".join(findings)
        print("\n=== FINDINGS ===\n" + report)
        (OUT / "_pagination.txt").write_text(f"search: {SEARCH}\n\n{report}\n", encoding="utf-8")
        print(f"\nsaved to {OUT / '_pagination.txt'}")
    finally:
        browser.stop()
|
||||
|
||||
|
||||
# Script entry point: run main() to completion on the event loop returned by
# nodriver's loop() helper.
if __name__ == "__main__":
    uc.loop().run_until_complete(main())
|
||||
Reference in New Issue
Block a user