Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
77
worker/verify_count.py
Normal file
77
worker/verify_count.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
One-off count verification: scrape a single skin+wear search from cs.money and
|
||||
report how many distinct sell-orders come back, reusing the production worker's
|
||||
warm-session + price-window bisection logic (worker.scrape_job).
|
||||
|
||||
Use it to sanity-check that our pagination actually recovers the FULL listing
|
||||
count cs.money shows on the site (the known ground truth) for one query.
|
||||
|
||||
cd worker
|
||||
.venv\\Scripts\\Activate.ps1
|
||||
python verify_count.py "Desert Eagle Bronze Deco fn"
|
||||
|
||||
Env knobs (same meaning as worker.py): SOLVE_SECONDS, DELAY, JITTER, PROXY,
BROWSER_PATH, LOAD_IMAGES. MAX_FETCHES caps window fetches (default 80).
CHROME_NO_SANDBOX=1 adds --no-sandbox and --disable-dev-shm-usage to the
browser launch flags.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from collections import Counter
|
||||
|
||||
import nodriver as uc
|
||||
|
||||
import worker
|
||||
|
||||
# Upper bound on window fetches for one verification run; overridable through
# the MAX_FETCHES environment variable, 80 when unset.
MAX_FETCHES = int(os.getenv("MAX_FETCHES", "80"))
|
||||
|
||||
|
||||
async def main():
    """Scrape one cs.money search and print how many sell-orders came back.

    The search text is the command-line arguments joined with spaces, or a
    built-in default query when none are given.  A browser is started with
    flags derived from the worker module's settings, the session is warmed
    via ``worker.warm`` and the scrape itself is delegated to
    ``worker.scrape_job``.  The result summary plus a per-name / per-wear
    breakdown is written to stdout.  The browser is stopped on every exit
    path.
    """
    search = " ".join(sys.argv[1:]) or "Desert Eagle Bronze Deco fn"

    # Assemble Chromium launch flags one condition at a time.
    launch_flags = []
    if worker.PROXY:
        launch_flags.append(f"--proxy-server={worker.PROXY}")
    if not worker.LOAD_IMAGES:
        launch_flags.append("--blink-settings=imagesEnabled=false")
    if os.environ.get("CHROME_NO_SANDBOX") == "1":
        launch_flags.extend(("--no-sandbox", "--disable-dev-shm-usage"))

    print(f"Verifying count for search {search!r} (proxy={worker.PROXY or 'own IP'})")
    browser = await uc.start(
        headless=False,
        browser_executable_path=worker.BROWSER_PATH,
        browser_args=launch_flags,
    )
    try:
        page = await browser.get("about:blank")
        await worker.warm(page)

        # MAX_FETCHES is handed to the worker under the job's "maxPages" key.
        job = {"search": search, "maxPages": MAX_FETCHES}
        items, fetches, reason = await worker.scrape_job(page, job)

        print("\n=== result ===")
        print(f" search: {search}")
        print(f" stopped: {reason}")
        print(f" fetches: {fetches}")
        print(f" DISTINCT sell-orders (deduped by id): {len(items)}")

        # Break the result set down by name and wear so it is visible whether
        # the count is inflated by off-target names/wears (the C2's name+wear
        # filter would drop those later).
        assets = [entry.get("asset") or {} for entry in items]
        by_name = Counter((asset.get("names") or {}).get("full") for asset in assets)
        by_wear = Counter(asset.get("quality") for asset in assets)
        stattrak_total = sum(1 for asset in assets if asset.get("isStatTrak"))

        print(f" StatTrak in set: {stattrak_total}")
        print(" by name:")
        for label, count in by_name.most_common():
            print(f" {count:4d} {label}")
        print(" by wear (quality code):")
        for wear_code, count in by_wear.most_common():
            print(f" {count:4d} {wear_code}")
    finally:
        # Always tear the browser down, including when warm-up or scraping raised.
        browser.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive main() to completion on the event loop that
    # nodriver's loop() helper returns.
    event_loop = uc.loop()
    event_loop.run_until_complete(main())
|
||||
Reference in New Issue
Block a user