almost ready

This commit is contained in:
bob
2026-06-01 10:52:06 -05:00
parent 8b0eb0db78
commit 763305ca89
94 changed files with 8766 additions and 2674 deletions

View File

@@ -0,0 +1,20 @@
"""Shared scaffolding for the BlueLaminate market scrape workers.
A market worker (cs.money, skin.land, …) subclasses `Worker`, fills in its scrape +
consent steps, and calls `run(MyWorker)`. Everything else — config, logging, the IPRoyal
proxy/forwarder, the C2 client, the poll/scrape/post loop, IP rotation, graceful
shutdown — lives here so it's written once.
"""
from .config import Settings
from .runtime import ScrapeResult, Worker, click, looks_like_challenge, page_fetch, run
__all__ = [
"Settings",
"ScrapeResult",
"Worker",
"click",
"looks_like_challenge",
"page_fetch",
"run",
]

57
worker/blworker/c2.py Normal file
View File

@@ -0,0 +1,57 @@
"""HTTP client for the .NET C2's job endpoints.
Stdlib urllib so the blocking calls run off the asyncio loop via to_thread (the event
loop belongs to the browser). Each worker points at one job route group — "/jobs" for
cs.money, "/skinland/jobs" for skin.land — set once at construction.
"""
import asyncio
import json
import logging
import urllib.error
import urllib.request
log = logging.getLogger("c2")
class C2Client:
def __init__(self, base_url: str, token: str, jobs_path: str):
self._base = base_url.rstrip("/")
self._token = token
self._jobs = jobs_path.strip("/")
def _get_job_sync(self):
req = urllib.request.Request(
f"{self._base}/{self._jobs}/next", headers={"X-Worker-Token": self._token})
try:
with urllib.request.urlopen(req, timeout=15) as r:
if r.status == 204:
return None
return json.loads(r.read() or b"null")
except urllib.error.HTTPError as e:
log.warning("/%s/next -> HTTP %s", self._jobs, e.code)
return None
except urllib.error.URLError as e:
log.warning("C2 unreachable: %s", e)
return None
def _post_result_sync(self, job_id: str, payload: dict):
data = json.dumps(payload).encode()
req = urllib.request.Request(
f"{self._base}/{self._jobs}/{job_id}/result", data=data, method="POST",
headers={"X-Worker-Token": self._token, "Content-Type": "application/json"})
try:
with urllib.request.urlopen(req, timeout=60) as r:
return json.loads(r.read() or b"null")
except urllib.error.HTTPError as e:
log.warning("result -> HTTP %s: %r", e.code, e.read()[:200])
return None
except urllib.error.URLError as e:
log.warning("C2 unreachable posting result: %s", e)
return None
async def get_job(self):
return await asyncio.to_thread(self._get_job_sync)
async def post_result(self, job_id, payload):
return await asyncio.to_thread(self._post_result_sync, job_id, payload)

81
worker/blworker/config.py Normal file
View File

@@ -0,0 +1,81 @@
"""Worker configuration, parsed once from the environment.
All env knobs the workers honor live here so there's a single source of truth (the
two market workers used to each re-parse the same ~15 vars). Frozen dataclass — read
it, don't mutate it.
"""
import os
from dataclasses import dataclass
def _int(name: str, default: int) -> int:
return int(os.environ.get(name, str(default)))
def _float(name: str, default: float) -> float:
return float(os.environ.get(name, str(default)))
def _flag(name: str) -> bool:
return os.environ.get(name) == "1"
@dataclass(frozen=True)
class Settings:
# C2
c2_url: str
token: str
# Session / pacing
market_url: str # "" => use the worker's own default page
solve_seconds: int
delay: float
jitter: float
idle_seconds: int
# Browser
browser_path: str | None
load_images: bool
chrome_no_sandbox: bool
# Proxy (auth-free fallback)
proxy: str | None
# IPRoyal residential gateway
iproyal_host: str
iproyal_port: int
iproyal_username: str | None
iproyal_password: str | None
iproyal_country: str
iproyal_lifetime_min: int
# Logging
log_level: str
log_json: bool
@property
def use_iproyal(self) -> bool:
"""IPRoyal takes priority over a plain PROXY when its creds are set."""
return bool(self.iproyal_username and self.iproyal_password)
@classmethod
def from_env(cls) -> "Settings":
return cls(
c2_url=os.environ.get("C2_URL", "http://localhost:5080").rstrip("/"),
token=os.environ.get("WORKER_TOKEN", "dev-worker-token"),
market_url=os.environ.get("MARKET_URL", ""),
solve_seconds=_int("SOLVE_SECONDS", 30),
delay=_float("DELAY", 2.0),
jitter=_float("JITTER", 1.5),
idle_seconds=_int("IDLE_SECONDS", 10),
browser_path=os.environ.get("BROWSER_PATH") or None,
# Residential proxy is metered per GB; Cloudflare gates on JS, not images, and
# the market APIs are pure JSON — so block images unless explicitly debugging.
load_images=_flag("LOAD_IMAGES"),
chrome_no_sandbox=_flag("CHROME_NO_SANDBOX"),
proxy=os.environ.get("PROXY") or None,
iproyal_host=os.environ.get("IPROYAL_HOST", "geo.iproyal.com"),
iproyal_port=_int("IPROYAL_PORT", 12321),
iproyal_username=os.environ.get("IPROYAL_USERNAME") or None,
iproyal_password=os.environ.get("IPROYAL_PASSWORD") or None,
iproyal_country=os.environ.get("IPROYAL_COUNTRY", "us").strip().lower(),
iproyal_lifetime_min=_int("IPROYAL_LIFETIME_MIN", 60),
log_level=os.environ.get("LOG_LEVEL", "INFO").upper(),
log_json=_flag("LOG_JSON"),
)

47
worker/blworker/log.py Normal file
View File

@@ -0,0 +1,47 @@
"""Stdlib logging setup — one stream handler on stdout, human or JSON.
Workers used to print() everything; that gives no levels, no timestamps, and nothing
Loki can parse. Default is a compact human format for local runs; set LOG_JSON=1 in the
container so Grafana Alloy -> Loki gets structured fields (ts, level, logger, msg) plus
any `extra=` keys a call site attaches.
"""
import json
import logging
import sys
# logging.LogRecord built-ins we don't want to echo into a JSON line as "extra" fields.
_RESERVED = set(
logging.makeLogRecord({}).__dict__
) | {"message", "asctime", "taskName"}
class _JsonFormatter(logging.Formatter):
def format(self, record: logging.LogRecord) -> str:
payload = {
"ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
"level": record.levelname,
"logger": record.name,
"msg": record.getMessage(),
}
for key, value in record.__dict__.items():
if key not in _RESERVED and not key.startswith("_"):
payload[key] = value
if record.exc_info:
payload["exc"] = self.formatException(record.exc_info)
return json.dumps(payload, default=str)
def configure(level: str = "INFO", json_logs: bool = False) -> None:
"""Install a single stdout handler on the root logger (idempotent)."""
handler = logging.StreamHandler(sys.stdout)
if json_logs:
handler.setFormatter(_JsonFormatter())
else:
handler.setFormatter(
logging.Formatter("%(asctime)s %(levelname)-5s %(name)s | %(message)s", "%H:%M:%S")
)
root = logging.getLogger()
root.handlers.clear()
root.addHandler(handler)
root.setLevel(level)

154
worker/blworker/proxy.py Normal file
View File

@@ -0,0 +1,154 @@
"""IPRoyal residential proxy plumbing.
The in-process forwarder + the password/session helpers — identical across every market
worker, so they live here. HTTPS market traffic flows through the CONNECT tunnel, so the
forwarder only ever relays ciphertext. Ported from the .NET LocalForwardingProxy /
IpRoyalProxyProvider.
"""
import asyncio
import base64
import logging
import uuid
log = logging.getLogger("proxy")
def new_session_id() -> str:
"""Short, opaque, URL-safe token. IPRoyal pins one residential exit IP per distinct
session value, so a fresh id == a fresh IP."""
return uuid.uuid4().hex[:10]
def iproyal_password(password: str, country: str, lifetime_min: int, session_id: str) -> str:
"""Bake the targeting/session knobs onto the account password, IPRoyal-style:
"<pass>_country-us_session-<id>_lifetime-60m". Country is optional."""
pw = password
if country:
pw += f"_country-{country}"
pw += f"_session-{session_id}_lifetime-{lifetime_min}m"
return pw
class LocalForwardingProxy:
"""In-process HTTP proxy on 127.0.0.1 that chains every connection to the IPRoyal
gateway, injecting the Proxy-Authorization header itself. Chromium ignores creds in
--proxy-server and the in-browser ways to answer the gateway's 407 (a CDP auth
handler, or a disabled MV2 extension) are Cloudflare tells — so we terminate the
browser->proxy hop locally and add auth here, leaving Chrome to talk to an auth-free
endpoint at zero CDP. HTTPS (all market traffic) flows through the CONNECT tunnel, so
this proxy only relays ciphertext and never sees plaintext. The active session token
can be swapped live (set_password) to move to a fresh exit IP without restarting the
browser. (New tunnels pick up the new IP; any still-open keep-alive tunnel stays on
the old one until it closes.)"""
def __init__(self, host: str, port: int, username: str, password: str):
self._host = host
self._port = port
self._username = username
self._password = password
self._server: asyncio.AbstractServer | None = None
self.endpoint = ""
def set_password(self, password: str) -> None:
self._password = password
def _auth_header(self) -> str:
token = base64.b64encode(f"{self._username}:{self._password}".encode()).decode()
return f"Proxy-Authorization: Basic {token}\r\n"
async def start(self) -> "LocalForwardingProxy":
self._server = await asyncio.start_server(self._handle, "127.0.0.1", 0)
port = self._server.sockets[0].getsockname()[1]
self.endpoint = f"127.0.0.1:{port}"
return self
async def stop(self) -> None:
if self._server is not None:
self._server.close()
try:
await self._server.wait_closed()
except Exception:
pass
@staticmethod
async def _read_header(reader: asyncio.StreamReader) -> str | None:
"""Read up to the end of the HTTP header block (CRLFCRLF). None on EOF/overflow."""
try:
data = await reader.readuntil(b"\r\n\r\n")
except (asyncio.IncompleteReadError, asyncio.LimitOverrunError):
return None
return data.decode("latin-1")
async def _handle(self, client_reader: asyncio.StreamReader, client_writer: asyncio.StreamWriter) -> None:
up_writer: asyncio.StreamWriter | None = None
try:
header = await self._read_header(client_reader)
if not header:
return
parts = header.split("\r\n", 1)[0].split(" ")
if len(parts) < 2:
return
method, target = parts[0], parts[1]
up_reader, up_writer = await asyncio.open_connection(self._host, self._port)
if method.upper() == "CONNECT":
# HTTPS: open an authenticated tunnel upstream, then relay raw bytes.
up_writer.write(
f"CONNECT {target} HTTP/1.1\r\nHost: {target}\r\n{self._auth_header()}\r\n".encode())
await up_writer.drain()
up_header = await self._read_header(up_reader)
status = up_header.split(" ", 2) if up_header else []
if len(status) < 2 or status[1] != "200":
line = (up_header or "no response").split("\r\n", 1)[0]
log.warning("upstream refused CONNECT %s: %s", target, line)
client_writer.write(b"HTTP/1.1 502 Bad Gateway\r\nConnection: close\r\n\r\n")
await client_writer.drain()
return
client_writer.write(b"HTTP/1.1 200 Connection established\r\n\r\n")
await client_writer.drain()
else:
# Plain HTTP: re-inject the request upstream with auth, then relay.
idx = header.index("\r\n") + 2
up_writer.write((header[:idx] + self._auth_header() + header[idx:]).encode())
await up_writer.drain()
await self._relay(client_reader, client_writer, up_reader, up_writer)
except Exception:
pass # one bad tunnel must never take down the listener
finally:
for w in (client_writer, up_writer):
if w is not None:
try:
w.close()
except Exception:
pass
@staticmethod
async def _relay(
client_reader: asyncio.StreamReader, client_writer: asyncio.StreamWriter,
up_reader: asyncio.StreamReader, up_writer: asyncio.StreamWriter) -> None:
# Pipe both directions, but tear the whole tunnel down as soon as EITHER side
# closes (mirrors the .NET WhenAny). Waiting for both — as a plain gather does —
# leaks a task holding two sockets on every half-closed connection, which piles
# up fast across a long multi-worker run. Closing both writers when the first pipe
# finishes unblocks the other's pending read so both tasks settle.
async def pipe(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
try:
while data := await reader.read(65536):
writer.write(data)
await writer.drain()
except Exception:
pass
a = asyncio.create_task(pipe(client_reader, up_writer))
b = asyncio.create_task(pipe(up_reader, client_writer))
try:
await asyncio.wait({a, b}, return_when=asyncio.FIRST_COMPLETED)
finally:
for w in (client_writer, up_writer):
try:
w.close()
except Exception:
pass
await asyncio.gather(a, b, return_exceptions=True)

235
worker/blworker/runtime.py Normal file
View File

@@ -0,0 +1,235 @@
"""The shared worker runtime — everything that's identical across market workers.
`Worker` is a template-method base: it owns the proxy/browser bring-up, the poll ->
scrape -> post loop, Cloudflare-driven IP rotation, result logging, and graceful
shutdown. A market worker subclasses it and fills in only what differs — how to dismiss
the consent banner, how to scrape one job, and how to describe a job in the log. The two
~300-line workers used to copy this whole loop verbatim.
"""
import asyncio
import json
import logging
import random
import signal
from abc import ABC, abstractmethod
from dataclasses import dataclass
import nodriver as uc
from .c2 import C2Client
from .config import Settings
from .proxy import LocalForwardingProxy, iproyal_password, new_session_id
@dataclass
class ScrapeResult:
"""What a single job scrape yields. `wire_bytes` is the metered (compressed) cost."""
items: list
pages: int
reason: str
wire_bytes: int = 0
def looks_like_challenge(body: str) -> bool:
"""True for an actual Cloudflare interstitial (or an empty body). Keyed on CF markers,
NOT a leading '<' — a real market page IS html, so a startswith('<') check would flag
every good page fetch as a challenge."""
b = body or ""
return not b.strip() or "Just a moment" in b or "challenge-platform" in b
async def page_fetch(page, url: str, accept: str = "application/json") -> tuple[int, str, int]:
"""Fetch in-page from the warm (Cloudflare-cleared) session and read back the Resource
Timing transferSize — the actual compressed bytes the metered proxy bills (or -1 when
cross-origin timing isn't exposed). Returns (status, body, wire_bytes). Use
accept='text/html' for an SSR page payload, the default JSON for an API."""
expr = (
f"fetch({url!r}, {{credentials:'include', headers:{{'accept': {accept!r}}}}})"
f".then(async r => {{"
f" const body = await r.text();"
f" const e = performance.getEntriesByName({url!r}).slice(-1)[0];"
f" return JSON.stringify({{status: r.status, body: body, wire: e ? e.transferSize : -1}});"
f"}}).catch(e => JSON.stringify({{status: -1, body: String(e), wire: -1}}))"
)
raw = await page.evaluate(expr, await_promise=True)
if not isinstance(raw, str):
return (-1, "", -1)
try:
obj = json.loads(raw)
return (int(obj.get("status", -1)), obj.get("body", ""), int(obj.get("wire", -1)))
except (json.JSONDecodeError, ValueError, TypeError):
return (-1, raw, -1)
async def click(page, text: str, timeout: int = 3) -> bool:
"""Best-match click on visible text; swallow the not-found/timeout case."""
try:
el = await page.find(text, best_match=True, timeout=timeout)
if el:
await el.click()
return True
except Exception:
pass
return False
class Worker(ABC):
# Per-market constants, set by the subclass.
name: str = "worker"
jobs_path: str = "/jobs"
default_market_url: str = ""
def __init__(self, settings: Settings):
self.settings = settings
self.market_url = settings.market_url or self.default_market_url
self.c2 = C2Client(settings.c2_url, settings.token, self.jobs_path)
self.log = logging.getLogger(self.name)
self._forwarder: LocalForwardingProxy | None = None
self._session_id: str | None = None
self._stop = asyncio.Event()
# --- hooks a market worker overrides ------------------------------------------
@abstractmethod
async def scrape_job(self, page, job) -> ScrapeResult:
"""Scrape ALL listings for one job and return them."""
@abstractmethod
def describe_job(self, job) -> str:
"""One-line job description for the log (e.g. the search term or slug)."""
async def dismiss_consent(self, page) -> str | None:
"""Dismiss the cookie banner privacy-first; return a note, or None if absent.
Default: nothing to do. Markets with a banner override this."""
return None
# --- shared machinery ---------------------------------------------------------
def _iproyal_password(self, session_id: str) -> str:
s = self.settings
return iproyal_password(s.iproyal_password, s.iproyal_country, s.iproyal_lifetime_min, session_id)
async def _pace(self, page) -> None:
await page.sleep(self.settings.delay + random.uniform(0, self.settings.jitter))
async def warm(self, page) -> None:
"""Open the market and clear Cloudflare so the session holds cf_clearance."""
s = self.settings
self.log.info("warming session at %s (clear Cloudflare; %ds)", self.market_url, s.solve_seconds)
await page.get(self.market_url)
await page.sleep(s.solve_seconds)
note = await self.dismiss_consent(page)
self.log.info("consent: %s", note or "left up")
async def _setup_proxy(self) -> tuple[str | None, str]:
"""IPRoyal (auth'd, per-worker sticky IP) takes priority; else a plain auth-free
PROXY; else this host's own IP. Returns (proxy_endpoint, human_label)."""
s = self.settings
if s.use_iproyal:
self._session_id = new_session_id()
self._forwarder = await LocalForwardingProxy(
s.iproyal_host, s.iproyal_port, s.iproyal_username,
self._iproyal_password(self._session_id)).start()
label = f"iproyal[{s.iproyal_country or 'any'}] session {self._session_id} via {self._forwarder.endpoint}"
return self._forwarder.endpoint, label
return s.proxy, (s.proxy or "own IP")
def _browser_args(self, proxy: str | None) -> list[str]:
s = self.settings
args = [f"--proxy-server={proxy}"] if proxy else []
if not s.load_images:
# Disable image loading at the engine level — the dominant bandwidth cost on
# an image-heavy market, and unneeded for CF clearance or the JSON API.
args.append("--blink-settings=imagesEnabled=false")
if s.chrome_no_sandbox:
# Required when running Chromium as root in a container.
args += ["--no-sandbox", "--disable-dev-shm-usage"]
return args
async def _on_challenge(self, page) -> None:
"""The exit IP is likely flagged. On IPRoyal, rotate to a fresh sticky session
(new IP) before re-warming; otherwise just re-solve in place."""
if self._forwarder is not None:
self._session_id = new_session_id()
self._forwarder.set_password(self._iproyal_password(self._session_id))
self.log.warning("challenged; rotating exit IP -> session %s, re-warming", self._session_id)
else:
self.log.warning("challenged; re-warming session")
await self.warm(page)
def _log_result(self, res: ScrapeResult, posted: dict | None, total_wire: int) -> None:
if posted:
summary = (f"matched {posted.get('matched')}, new {posted.get('inserted')}, "
f"upd {posted.get('updated')}, removed {posted.get('removed')}")
else:
summary = "post failed"
self.log.info("scraped %d items (%dp, %s, %.0fKB wire) -> %s [lifetime %.1fMB]",
len(res.items), res.pages, res.reason, res.wire_bytes / 1024,
summary, total_wire / 1_048_576)
def _install_signal_handlers(self) -> None:
"""Stop the loop on SIGINT/SIGTERM so `docker stop` shuts down cleanly. Not
supported on Windows (ProactorEventLoop) — there Ctrl-C still raises
KeyboardInterrupt, which the run loop's finally handles just as well."""
try:
loop = asyncio.get_running_loop()
for sig in (signal.SIGINT, signal.SIGTERM):
loop.add_signal_handler(sig, self._stop.set)
except (NotImplementedError, AttributeError):
pass
async def _idle(self) -> None:
"""Sleep when the C2 has no work, but wake immediately on shutdown."""
try:
await asyncio.wait_for(self._stop.wait(), timeout=self.settings.idle_seconds)
except asyncio.TimeoutError:
pass
async def run(self) -> None:
self._install_signal_handlers()
s = self.settings
proxy, proxy_label = await self._setup_proxy()
self.log.info("starting (C2=%s, proxy=%s, images=%s)",
s.c2_url, proxy_label, "on" if s.load_images else "off")
browser = await uc.start(
headless=False, browser_executable_path=s.browser_path,
browser_args=self._browser_args(proxy))
try:
page = await browser.get("about:blank")
await self.warm(page)
total_wire = 0 # metered (compressed) bytes pulled, lifetime
while not self._stop.is_set():
job = await self.c2.get_job()
if not job:
await self._idle()
continue
self.log.info("job %s%s", job["jobId"][:8], self.describe_job(job))
res = await self.scrape_job(page, job)
total_wire += res.wire_bytes
if res.reason == "challenged":
await self._on_challenge(page)
posted = await self.c2.post_result(job["jobId"], {
"items": res.items, "pages": res.pages, "stoppedReason": res.reason})
self._log_result(res, posted, total_wire)
await self._pace(page)
finally:
self.log.info("shutting down")
browser.stop()
if self._forwarder is not None:
await self._forwarder.stop()
def run(worker_cls: type[Worker]) -> None:
"""Boot a worker from the environment: parse config, set up logging, run the loop on
nodriver's event loop. The thin market scripts call this and nothing else."""
from . import log as log_setup
settings = Settings.from_env()
log_setup.configure(settings.log_level, settings.log_json)
uc.loop().run_until_complete(worker_cls(settings).run())