Cut metered-proxy bandwidth: re-sweep floor + wire-size logging
JobQueue now skips bands swept within MinResweepHours (config, default 6h) instead of re-scraping the whole catalogue continuously — the dominant cost on the metered residential proxy. Roughly linear savings with no data loss (full pagination is retained); setting MinResweepHours to 0 disables the re-sweep floor. The worker now logs the real compressed transferSize per job (what the proxy bills) rather than the ~6.5x-larger decompressed length, so spend is visible.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -283,19 +283,28 @@ async def post_result(job_id, payload):
|
||||
|
||||
# --- scraping ---------------------------------------------------------------------
|
||||
|
||||
async def fetch_json(page, url: str) -> tuple[str, str]:
|
||||
async def fetch_json(page, url: str) -> tuple[str, str, int]:
|
||||
"""Fetch in-page and also read back the Resource Timing transferSize — the actual
|
||||
COMPRESSED bytes on the wire (what the metered proxy bills), not len(body) which is
|
||||
the decompressed size. Returns (status, body, wire_bytes); wire_bytes is -1 if the
|
||||
timing entry wasn't available. Same-origin (cs.money), so the size fields are exposed."""
|
||||
expr = (
|
||||
f"fetch({url!r}, {{credentials:'include', headers:{{'accept':'application/json'}}}})"
|
||||
f".then(async r => JSON.stringify({{status: r.status, body: await r.text()}}))"
|
||||
f".then(async r => {{"
|
||||
f" const body = await r.text();"
|
||||
f" const e = performance.getEntriesByName({url!r}).slice(-1)[0];"
|
||||
f" return JSON.stringify({{status: r.status, body: body,"
|
||||
f" wire: e ? e.transferSize : -1, dec: e ? e.decodedBodySize : -1}});"
|
||||
f"}})"
|
||||
)
|
||||
raw = await page.evaluate(expr, await_promise=True)
|
||||
if not isinstance(raw, str):
|
||||
return ("-1", "")
|
||||
return ("-1", "", -1)
|
||||
try:
|
||||
obj = json.loads(raw)
|
||||
return (str(obj.get("status", "-1")), obj.get("body", ""))
|
||||
except json.JSONDecodeError:
|
||||
return ("-1", raw)
|
||||
return (str(obj.get("status", "-1")), obj.get("body", ""), int(obj.get("wire", -1)))
|
||||
except (json.JSONDecodeError, ValueError, TypeError):
|
||||
return ("-1", raw, -1)
|
||||
|
||||
|
||||
async def _click(page, text, timeout=3):
|
||||
@@ -346,28 +355,32 @@ def extract_items(html: str) -> list:
|
||||
return []
|
||||
|
||||
|
||||
async def scrape_job(page, job) -> tuple[list, int, str]:
|
||||
async def scrape_job(page, job) -> tuple[list, int, str, int]:
|
||||
"""Scrape ALL listings for one skin+wear via a forward float cursor.
|
||||
|
||||
A search page returns at most 60 items and ignores offset, but cs.money sorts by
|
||||
float (order=asc&sort=float) and filters by minFloat. So we walk the float axis:
|
||||
grab the 60 lowest-float items at/above `lo`, advance `lo` to the highest float on
|
||||
the page, and repeat until a page is under the cap. The boundary item is re-fetched
|
||||
(minFloat is inclusive) and dropped by the id dedup. Returns (items, fetches, reason).
|
||||
(minFloat is inclusive) and dropped by the id dedup. Returns
|
||||
(items, fetches, reason, wire_bytes) where wire_bytes is the metered (compressed) cost.
|
||||
"""
|
||||
search = urllib.parse.quote_plus(job["search"])
|
||||
max_fetches = job.get("maxPages", 40) # safety cap on page fetches per job
|
||||
seen: dict = {}
|
||||
fetches = 0
|
||||
wire = 0
|
||||
lo = 0.0
|
||||
reason = "completed"
|
||||
|
||||
while fetches < max_fetches:
|
||||
status, body = await fetch_json(page, PAGE.format(search=search, lo=lo))
|
||||
status, body, wbytes = await fetch_json(page, PAGE.format(search=search, lo=lo))
|
||||
fetches += 1
|
||||
if wbytes > 0:
|
||||
wire += wbytes
|
||||
|
||||
if "Just a moment" in body or "challenge-platform" in body:
|
||||
return list(seen.values()), fetches, "challenged"
|
||||
return list(seen.values()), fetches, "challenged", wire
|
||||
|
||||
items = extract_items(body)
|
||||
floats = []
|
||||
@@ -396,7 +409,7 @@ async def scrape_job(page, job) -> tuple[list, int, str]:
|
||||
else:
|
||||
reason = "fetch-cap"
|
||||
|
||||
return list(seen.values()), fetches, reason
|
||||
return list(seen.values()), fetches, reason, wire
|
||||
|
||||
|
||||
async def main():
|
||||
@@ -429,6 +442,7 @@ async def main():
|
||||
page = await browser.get("about:blank")
|
||||
await warm(page)
|
||||
|
||||
total_wire = 0 # metered (compressed) bytes this worker has pulled, lifetime
|
||||
while True:
|
||||
job = await get_job()
|
||||
if not job:
|
||||
@@ -436,7 +450,8 @@ async def main():
|
||||
continue
|
||||
|
||||
print(f"Job {job['jobId'][:8]} — search {job['search']!r}")
|
||||
items, pages, reason = await scrape_job(page, job)
|
||||
items, pages, reason, wire = await scrape_job(page, job)
|
||||
total_wire += wire
|
||||
|
||||
if reason == "challenged":
|
||||
# The exit IP is likely flagged. On IPRoyal, rotate to a fresh sticky
|
||||
@@ -453,7 +468,9 @@ async def main():
|
||||
"items": items, "pages": pages, "stoppedReason": reason})
|
||||
summary = (f"matched {result.get('matched')}, new {result.get('inserted')}, "
|
||||
f"upd {result.get('updated')}, removed {result.get('removed')}") if result else "post failed"
|
||||
print(f" scraped {len(items)} items ({pages}p, {reason}) -> {summary}")
|
||||
wire_kb = wire / 1024
|
||||
print(f" scraped {len(items)} items ({pages}p, {reason}, {wire_kb:.0f}KB wire) "
|
||||
f"-> {summary} [lifetime {total_wire / 1_048_576:.1f}MB]")
|
||||
|
||||
await page.sleep(DELAY + random.uniform(0, JITTER))
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user