almost ready
This commit is contained in:
20
worker/blworker/__init__.py
Normal file
20
worker/blworker/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
"""Shared scaffolding for the BlueLaminate market scrape workers.
|
||||
|
||||
A market worker (cs.money, skin.land, …) subclasses `Worker`, fills in its scrape +
|
||||
consent steps, and calls `run(MyWorker)`. Everything else — config, logging, the IPRoyal
|
||||
proxy/forwarder, the C2 client, the poll/scrape/post loop, IP rotation, graceful
|
||||
shutdown — lives here so it's written once.
|
||||
"""
|
||||
|
||||
from .config import Settings
|
||||
from .runtime import ScrapeResult, Worker, click, looks_like_challenge, page_fetch, run
|
||||
|
||||
__all__ = [
|
||||
"Settings",
|
||||
"ScrapeResult",
|
||||
"Worker",
|
||||
"click",
|
||||
"looks_like_challenge",
|
||||
"page_fetch",
|
||||
"run",
|
||||
]
|
||||
57
worker/blworker/c2.py
Normal file
57
worker/blworker/c2.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""HTTP client for the .NET C2's job endpoints.
|
||||
|
||||
Stdlib urllib so the blocking calls run off the asyncio loop via to_thread (the event
|
||||
loop belongs to the browser). Each worker points at one job route group — "/jobs" for
|
||||
cs.money, "/skinland/jobs" for skin.land — set once at construction.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
log = logging.getLogger("c2")
|
||||
|
||||
|
||||
class C2Client:
|
||||
def __init__(self, base_url: str, token: str, jobs_path: str):
|
||||
self._base = base_url.rstrip("/")
|
||||
self._token = token
|
||||
self._jobs = jobs_path.strip("/")
|
||||
|
||||
def _get_job_sync(self):
|
||||
req = urllib.request.Request(
|
||||
f"{self._base}/{self._jobs}/next", headers={"X-Worker-Token": self._token})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=15) as r:
|
||||
if r.status == 204:
|
||||
return None
|
||||
return json.loads(r.read() or b"null")
|
||||
except urllib.error.HTTPError as e:
|
||||
log.warning("/%s/next -> HTTP %s", self._jobs, e.code)
|
||||
return None
|
||||
except urllib.error.URLError as e:
|
||||
log.warning("C2 unreachable: %s", e)
|
||||
return None
|
||||
|
||||
def _post_result_sync(self, job_id: str, payload: dict):
|
||||
data = json.dumps(payload).encode()
|
||||
req = urllib.request.Request(
|
||||
f"{self._base}/{self._jobs}/{job_id}/result", data=data, method="POST",
|
||||
headers={"X-Worker-Token": self._token, "Content-Type": "application/json"})
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=60) as r:
|
||||
return json.loads(r.read() or b"null")
|
||||
except urllib.error.HTTPError as e:
|
||||
log.warning("result -> HTTP %s: %r", e.code, e.read()[:200])
|
||||
return None
|
||||
except urllib.error.URLError as e:
|
||||
log.warning("C2 unreachable posting result: %s", e)
|
||||
return None
|
||||
|
||||
async def get_job(self):
|
||||
return await asyncio.to_thread(self._get_job_sync)
|
||||
|
||||
async def post_result(self, job_id, payload):
|
||||
return await asyncio.to_thread(self._post_result_sync, job_id, payload)
|
||||
81
worker/blworker/config.py
Normal file
81
worker/blworker/config.py
Normal file
@@ -0,0 +1,81 @@
|
||||
"""Worker configuration, parsed once from the environment.
|
||||
|
||||
All env knobs the workers honor live here so there's a single source of truth (the
|
||||
two market workers used to each re-parse the same ~15 vars). Frozen dataclass — read
|
||||
it, don't mutate it.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
def _int(name: str, default: int) -> int:
|
||||
return int(os.environ.get(name, str(default)))
|
||||
|
||||
|
||||
def _float(name: str, default: float) -> float:
|
||||
return float(os.environ.get(name, str(default)))
|
||||
|
||||
|
||||
def _flag(name: str) -> bool:
|
||||
return os.environ.get(name) == "1"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Settings:
|
||||
# C2
|
||||
c2_url: str
|
||||
token: str
|
||||
# Session / pacing
|
||||
market_url: str # "" => use the worker's own default page
|
||||
solve_seconds: int
|
||||
delay: float
|
||||
jitter: float
|
||||
idle_seconds: int
|
||||
# Browser
|
||||
browser_path: str | None
|
||||
load_images: bool
|
||||
chrome_no_sandbox: bool
|
||||
# Proxy (auth-free fallback)
|
||||
proxy: str | None
|
||||
# IPRoyal residential gateway
|
||||
iproyal_host: str
|
||||
iproyal_port: int
|
||||
iproyal_username: str | None
|
||||
iproyal_password: str | None
|
||||
iproyal_country: str
|
||||
iproyal_lifetime_min: int
|
||||
# Logging
|
||||
log_level: str
|
||||
log_json: bool
|
||||
|
||||
@property
|
||||
def use_iproyal(self) -> bool:
|
||||
"""IPRoyal takes priority over a plain PROXY when its creds are set."""
|
||||
return bool(self.iproyal_username and self.iproyal_password)
|
||||
|
||||
@classmethod
|
||||
def from_env(cls) -> "Settings":
|
||||
return cls(
|
||||
c2_url=os.environ.get("C2_URL", "http://localhost:5080").rstrip("/"),
|
||||
token=os.environ.get("WORKER_TOKEN", "dev-worker-token"),
|
||||
market_url=os.environ.get("MARKET_URL", ""),
|
||||
solve_seconds=_int("SOLVE_SECONDS", 30),
|
||||
delay=_float("DELAY", 2.0),
|
||||
jitter=_float("JITTER", 1.5),
|
||||
idle_seconds=_int("IDLE_SECONDS", 10),
|
||||
browser_path=os.environ.get("BROWSER_PATH") or None,
|
||||
# Residential proxy is metered per GB; Cloudflare gates on JS, not images, and
|
||||
# the market APIs are pure JSON — so block images unless explicitly debugging.
|
||||
load_images=_flag("LOAD_IMAGES"),
|
||||
chrome_no_sandbox=_flag("CHROME_NO_SANDBOX"),
|
||||
proxy=os.environ.get("PROXY") or None,
|
||||
iproyal_host=os.environ.get("IPROYAL_HOST", "geo.iproyal.com"),
|
||||
iproyal_port=_int("IPROYAL_PORT", 12321),
|
||||
iproyal_username=os.environ.get("IPROYAL_USERNAME") or None,
|
||||
iproyal_password=os.environ.get("IPROYAL_PASSWORD") or None,
|
||||
iproyal_country=os.environ.get("IPROYAL_COUNTRY", "us").strip().lower(),
|
||||
iproyal_lifetime_min=_int("IPROYAL_LIFETIME_MIN", 60),
|
||||
log_level=os.environ.get("LOG_LEVEL", "INFO").upper(),
|
||||
log_json=_flag("LOG_JSON"),
|
||||
)
|
||||
47
worker/blworker/log.py
Normal file
47
worker/blworker/log.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Stdlib logging setup — one stream handler on stdout, human or JSON.
|
||||
|
||||
Workers used to print() everything; that gives no levels, no timestamps, and nothing
|
||||
Loki can parse. Default is a compact human format for local runs; set LOG_JSON=1 in the
|
||||
container so Grafana Alloy -> Loki gets structured fields (ts, level, logger, msg) plus
|
||||
any `extra=` keys a call site attaches.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
|
||||
# logging.LogRecord built-ins we don't want to echo into a JSON line as "extra" fields.
|
||||
_RESERVED = set(
|
||||
logging.makeLogRecord({}).__dict__
|
||||
) | {"message", "asctime", "taskName"}
|
||||
|
||||
|
||||
class _JsonFormatter(logging.Formatter):
|
||||
def format(self, record: logging.LogRecord) -> str:
|
||||
payload = {
|
||||
"ts": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
|
||||
"level": record.levelname,
|
||||
"logger": record.name,
|
||||
"msg": record.getMessage(),
|
||||
}
|
||||
for key, value in record.__dict__.items():
|
||||
if key not in _RESERVED and not key.startswith("_"):
|
||||
payload[key] = value
|
||||
if record.exc_info:
|
||||
payload["exc"] = self.formatException(record.exc_info)
|
||||
return json.dumps(payload, default=str)
|
||||
|
||||
|
||||
def configure(level: str = "INFO", json_logs: bool = False) -> None:
|
||||
"""Install a single stdout handler on the root logger (idempotent)."""
|
||||
handler = logging.StreamHandler(sys.stdout)
|
||||
if json_logs:
|
||||
handler.setFormatter(_JsonFormatter())
|
||||
else:
|
||||
handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s %(levelname)-5s %(name)s | %(message)s", "%H:%M:%S")
|
||||
)
|
||||
root = logging.getLogger()
|
||||
root.handlers.clear()
|
||||
root.addHandler(handler)
|
||||
root.setLevel(level)
|
||||
154
worker/blworker/proxy.py
Normal file
154
worker/blworker/proxy.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""IPRoyal residential proxy plumbing.
|
||||
|
||||
The in-process forwarder + the password/session helpers — identical across every market
|
||||
worker, so they live here. HTTPS market traffic flows through the CONNECT tunnel, so the
|
||||
forwarder only ever relays ciphertext. Ported from the .NET LocalForwardingProxy /
|
||||
IpRoyalProxyProvider.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import logging
|
||||
import uuid
|
||||
|
||||
log = logging.getLogger("proxy")
|
||||
|
||||
|
||||
def new_session_id() -> str:
|
||||
"""Short, opaque, URL-safe token. IPRoyal pins one residential exit IP per distinct
|
||||
session value, so a fresh id == a fresh IP."""
|
||||
return uuid.uuid4().hex[:10]
|
||||
|
||||
|
||||
def iproyal_password(password: str, country: str, lifetime_min: int, session_id: str) -> str:
|
||||
"""Bake the targeting/session knobs onto the account password, IPRoyal-style:
|
||||
"<pass>_country-us_session-<id>_lifetime-60m". Country is optional."""
|
||||
pw = password
|
||||
if country:
|
||||
pw += f"_country-{country}"
|
||||
pw += f"_session-{session_id}_lifetime-{lifetime_min}m"
|
||||
return pw
|
||||
|
||||
|
||||
class LocalForwardingProxy:
|
||||
"""In-process HTTP proxy on 127.0.0.1 that chains every connection to the IPRoyal
|
||||
gateway, injecting the Proxy-Authorization header itself. Chromium ignores creds in
|
||||
--proxy-server and the in-browser ways to answer the gateway's 407 (a CDP auth
|
||||
handler, or a disabled MV2 extension) are Cloudflare tells — so we terminate the
|
||||
browser->proxy hop locally and add auth here, leaving Chrome to talk to an auth-free
|
||||
endpoint at zero CDP. HTTPS (all market traffic) flows through the CONNECT tunnel, so
|
||||
this proxy only relays ciphertext and never sees plaintext. The active session token
|
||||
can be swapped live (set_password) to move to a fresh exit IP without restarting the
|
||||
browser. (New tunnels pick up the new IP; any still-open keep-alive tunnel stays on
|
||||
the old one until it closes.)"""
|
||||
|
||||
def __init__(self, host: str, port: int, username: str, password: str):
|
||||
self._host = host
|
||||
self._port = port
|
||||
self._username = username
|
||||
self._password = password
|
||||
self._server: asyncio.AbstractServer | None = None
|
||||
self.endpoint = ""
|
||||
|
||||
def set_password(self, password: str) -> None:
|
||||
self._password = password
|
||||
|
||||
def _auth_header(self) -> str:
|
||||
token = base64.b64encode(f"{self._username}:{self._password}".encode()).decode()
|
||||
return f"Proxy-Authorization: Basic {token}\r\n"
|
||||
|
||||
async def start(self) -> "LocalForwardingProxy":
|
||||
self._server = await asyncio.start_server(self._handle, "127.0.0.1", 0)
|
||||
port = self._server.sockets[0].getsockname()[1]
|
||||
self.endpoint = f"127.0.0.1:{port}"
|
||||
return self
|
||||
|
||||
async def stop(self) -> None:
|
||||
if self._server is not None:
|
||||
self._server.close()
|
||||
try:
|
||||
await self._server.wait_closed()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
async def _read_header(reader: asyncio.StreamReader) -> str | None:
|
||||
"""Read up to the end of the HTTP header block (CRLFCRLF). None on EOF/overflow."""
|
||||
try:
|
||||
data = await reader.readuntil(b"\r\n\r\n")
|
||||
except (asyncio.IncompleteReadError, asyncio.LimitOverrunError):
|
||||
return None
|
||||
return data.decode("latin-1")
|
||||
|
||||
async def _handle(self, client_reader: asyncio.StreamReader, client_writer: asyncio.StreamWriter) -> None:
|
||||
up_writer: asyncio.StreamWriter | None = None
|
||||
try:
|
||||
header = await self._read_header(client_reader)
|
||||
if not header:
|
||||
return
|
||||
parts = header.split("\r\n", 1)[0].split(" ")
|
||||
if len(parts) < 2:
|
||||
return
|
||||
method, target = parts[0], parts[1]
|
||||
|
||||
up_reader, up_writer = await asyncio.open_connection(self._host, self._port)
|
||||
if method.upper() == "CONNECT":
|
||||
# HTTPS: open an authenticated tunnel upstream, then relay raw bytes.
|
||||
up_writer.write(
|
||||
f"CONNECT {target} HTTP/1.1\r\nHost: {target}\r\n{self._auth_header()}\r\n".encode())
|
||||
await up_writer.drain()
|
||||
up_header = await self._read_header(up_reader)
|
||||
status = up_header.split(" ", 2) if up_header else []
|
||||
if len(status) < 2 or status[1] != "200":
|
||||
line = (up_header or "no response").split("\r\n", 1)[0]
|
||||
log.warning("upstream refused CONNECT %s: %s", target, line)
|
||||
client_writer.write(b"HTTP/1.1 502 Bad Gateway\r\nConnection: close\r\n\r\n")
|
||||
await client_writer.drain()
|
||||
return
|
||||
client_writer.write(b"HTTP/1.1 200 Connection established\r\n\r\n")
|
||||
await client_writer.drain()
|
||||
else:
|
||||
# Plain HTTP: re-inject the request upstream with auth, then relay.
|
||||
idx = header.index("\r\n") + 2
|
||||
up_writer.write((header[:idx] + self._auth_header() + header[idx:]).encode())
|
||||
await up_writer.drain()
|
||||
|
||||
await self._relay(client_reader, client_writer, up_reader, up_writer)
|
||||
except Exception:
|
||||
pass # one bad tunnel must never take down the listener
|
||||
finally:
|
||||
for w in (client_writer, up_writer):
|
||||
if w is not None:
|
||||
try:
|
||||
w.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
async def _relay(
|
||||
client_reader: asyncio.StreamReader, client_writer: asyncio.StreamWriter,
|
||||
up_reader: asyncio.StreamReader, up_writer: asyncio.StreamWriter) -> None:
|
||||
# Pipe both directions, but tear the whole tunnel down as soon as EITHER side
|
||||
# closes (mirrors the .NET WhenAny). Waiting for both — as a plain gather does —
|
||||
# leaks a task holding two sockets on every half-closed connection, which piles
|
||||
# up fast across a long multi-worker run. Closing both writers when the first pipe
|
||||
# finishes unblocks the other's pending read so both tasks settle.
|
||||
async def pipe(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
|
||||
try:
|
||||
while data := await reader.read(65536):
|
||||
writer.write(data)
|
||||
await writer.drain()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
a = asyncio.create_task(pipe(client_reader, up_writer))
|
||||
b = asyncio.create_task(pipe(up_reader, client_writer))
|
||||
try:
|
||||
await asyncio.wait({a, b}, return_when=asyncio.FIRST_COMPLETED)
|
||||
finally:
|
||||
for w in (client_writer, up_writer):
|
||||
try:
|
||||
w.close()
|
||||
except Exception:
|
||||
pass
|
||||
await asyncio.gather(a, b, return_exceptions=True)
|
||||
235
worker/blworker/runtime.py
Normal file
235
worker/blworker/runtime.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""The shared worker runtime — everything that's identical across market workers.
|
||||
|
||||
`Worker` is a template-method base: it owns the proxy/browser bring-up, the poll ->
|
||||
scrape -> post loop, Cloudflare-driven IP rotation, result logging, and graceful
|
||||
shutdown. A market worker subclasses it and fills in only what differs — how to dismiss
|
||||
the consent banner, how to scrape one job, and how to describe a job in the log. The two
|
||||
~300-line workers used to copy this whole loop verbatim.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import signal
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
|
||||
import nodriver as uc
|
||||
|
||||
from .c2 import C2Client
|
||||
from .config import Settings
|
||||
from .proxy import LocalForwardingProxy, iproyal_password, new_session_id
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScrapeResult:
|
||||
"""What a single job scrape yields. `wire_bytes` is the metered (compressed) cost."""
|
||||
items: list
|
||||
pages: int
|
||||
reason: str
|
||||
wire_bytes: int = 0
|
||||
|
||||
|
||||
def looks_like_challenge(body: str) -> bool:
|
||||
"""True for an actual Cloudflare interstitial (or an empty body). Keyed on CF markers,
|
||||
NOT a leading '<' — a real market page IS html, so a startswith('<') check would flag
|
||||
every good page fetch as a challenge."""
|
||||
b = body or ""
|
||||
return not b.strip() or "Just a moment" in b or "challenge-platform" in b
|
||||
|
||||
|
||||
async def page_fetch(page, url: str, accept: str = "application/json") -> tuple[int, str, int]:
|
||||
"""Fetch in-page from the warm (Cloudflare-cleared) session and read back the Resource
|
||||
Timing transferSize — the actual compressed bytes the metered proxy bills (or -1 when
|
||||
cross-origin timing isn't exposed). Returns (status, body, wire_bytes). Use
|
||||
accept='text/html' for an SSR page payload, the default JSON for an API."""
|
||||
expr = (
|
||||
f"fetch({url!r}, {{credentials:'include', headers:{{'accept': {accept!r}}}}})"
|
||||
f".then(async r => {{"
|
||||
f" const body = await r.text();"
|
||||
f" const e = performance.getEntriesByName({url!r}).slice(-1)[0];"
|
||||
f" return JSON.stringify({{status: r.status, body: body, wire: e ? e.transferSize : -1}});"
|
||||
f"}}).catch(e => JSON.stringify({{status: -1, body: String(e), wire: -1}}))"
|
||||
)
|
||||
raw = await page.evaluate(expr, await_promise=True)
|
||||
if not isinstance(raw, str):
|
||||
return (-1, "", -1)
|
||||
try:
|
||||
obj = json.loads(raw)
|
||||
return (int(obj.get("status", -1)), obj.get("body", ""), int(obj.get("wire", -1)))
|
||||
except (json.JSONDecodeError, ValueError, TypeError):
|
||||
return (-1, raw, -1)
|
||||
|
||||
|
||||
async def click(page, text: str, timeout: int = 3) -> bool:
|
||||
"""Best-match click on visible text; swallow the not-found/timeout case."""
|
||||
try:
|
||||
el = await page.find(text, best_match=True, timeout=timeout)
|
||||
if el:
|
||||
await el.click()
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
||||
class Worker(ABC):
|
||||
# Per-market constants, set by the subclass.
|
||||
name: str = "worker"
|
||||
jobs_path: str = "/jobs"
|
||||
default_market_url: str = ""
|
||||
|
||||
def __init__(self, settings: Settings):
|
||||
self.settings = settings
|
||||
self.market_url = settings.market_url or self.default_market_url
|
||||
self.c2 = C2Client(settings.c2_url, settings.token, self.jobs_path)
|
||||
self.log = logging.getLogger(self.name)
|
||||
self._forwarder: LocalForwardingProxy | None = None
|
||||
self._session_id: str | None = None
|
||||
self._stop = asyncio.Event()
|
||||
|
||||
# --- hooks a market worker overrides ------------------------------------------
|
||||
|
||||
@abstractmethod
|
||||
async def scrape_job(self, page, job) -> ScrapeResult:
|
||||
"""Scrape ALL listings for one job and return them."""
|
||||
|
||||
@abstractmethod
|
||||
def describe_job(self, job) -> str:
|
||||
"""One-line job description for the log (e.g. the search term or slug)."""
|
||||
|
||||
async def dismiss_consent(self, page) -> str | None:
|
||||
"""Dismiss the cookie banner privacy-first; return a note, or None if absent.
|
||||
Default: nothing to do. Markets with a banner override this."""
|
||||
return None
|
||||
|
||||
# --- shared machinery ---------------------------------------------------------
|
||||
|
||||
def _iproyal_password(self, session_id: str) -> str:
|
||||
s = self.settings
|
||||
return iproyal_password(s.iproyal_password, s.iproyal_country, s.iproyal_lifetime_min, session_id)
|
||||
|
||||
async def _pace(self, page) -> None:
|
||||
await page.sleep(self.settings.delay + random.uniform(0, self.settings.jitter))
|
||||
|
||||
async def warm(self, page) -> None:
|
||||
"""Open the market and clear Cloudflare so the session holds cf_clearance."""
|
||||
s = self.settings
|
||||
self.log.info("warming session at %s (clear Cloudflare; %ds)", self.market_url, s.solve_seconds)
|
||||
await page.get(self.market_url)
|
||||
await page.sleep(s.solve_seconds)
|
||||
note = await self.dismiss_consent(page)
|
||||
self.log.info("consent: %s", note or "left up")
|
||||
|
||||
async def _setup_proxy(self) -> tuple[str | None, str]:
|
||||
"""IPRoyal (auth'd, per-worker sticky IP) takes priority; else a plain auth-free
|
||||
PROXY; else this host's own IP. Returns (proxy_endpoint, human_label)."""
|
||||
s = self.settings
|
||||
if s.use_iproyal:
|
||||
self._session_id = new_session_id()
|
||||
self._forwarder = await LocalForwardingProxy(
|
||||
s.iproyal_host, s.iproyal_port, s.iproyal_username,
|
||||
self._iproyal_password(self._session_id)).start()
|
||||
label = f"iproyal[{s.iproyal_country or 'any'}] session {self._session_id} via {self._forwarder.endpoint}"
|
||||
return self._forwarder.endpoint, label
|
||||
return s.proxy, (s.proxy or "own IP")
|
||||
|
||||
def _browser_args(self, proxy: str | None) -> list[str]:
|
||||
s = self.settings
|
||||
args = [f"--proxy-server={proxy}"] if proxy else []
|
||||
if not s.load_images:
|
||||
# Disable image loading at the engine level — the dominant bandwidth cost on
|
||||
# an image-heavy market, and unneeded for CF clearance or the JSON API.
|
||||
args.append("--blink-settings=imagesEnabled=false")
|
||||
if s.chrome_no_sandbox:
|
||||
# Required when running Chromium as root in a container.
|
||||
args += ["--no-sandbox", "--disable-dev-shm-usage"]
|
||||
return args
|
||||
|
||||
async def _on_challenge(self, page) -> None:
|
||||
"""The exit IP is likely flagged. On IPRoyal, rotate to a fresh sticky session
|
||||
(new IP) before re-warming; otherwise just re-solve in place."""
|
||||
if self._forwarder is not None:
|
||||
self._session_id = new_session_id()
|
||||
self._forwarder.set_password(self._iproyal_password(self._session_id))
|
||||
self.log.warning("challenged; rotating exit IP -> session %s, re-warming", self._session_id)
|
||||
else:
|
||||
self.log.warning("challenged; re-warming session")
|
||||
await self.warm(page)
|
||||
|
||||
def _log_result(self, res: ScrapeResult, posted: dict | None, total_wire: int) -> None:
|
||||
if posted:
|
||||
summary = (f"matched {posted.get('matched')}, new {posted.get('inserted')}, "
|
||||
f"upd {posted.get('updated')}, removed {posted.get('removed')}")
|
||||
else:
|
||||
summary = "post failed"
|
||||
self.log.info("scraped %d items (%dp, %s, %.0fKB wire) -> %s [lifetime %.1fMB]",
|
||||
len(res.items), res.pages, res.reason, res.wire_bytes / 1024,
|
||||
summary, total_wire / 1_048_576)
|
||||
|
||||
def _install_signal_handlers(self) -> None:
|
||||
"""Stop the loop on SIGINT/SIGTERM so `docker stop` shuts down cleanly. Not
|
||||
supported on Windows (ProactorEventLoop) — there Ctrl-C still raises
|
||||
KeyboardInterrupt, which the run loop's finally handles just as well."""
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
for sig in (signal.SIGINT, signal.SIGTERM):
|
||||
loop.add_signal_handler(sig, self._stop.set)
|
||||
except (NotImplementedError, AttributeError):
|
||||
pass
|
||||
|
||||
async def _idle(self) -> None:
|
||||
"""Sleep when the C2 has no work, but wake immediately on shutdown."""
|
||||
try:
|
||||
await asyncio.wait_for(self._stop.wait(), timeout=self.settings.idle_seconds)
|
||||
except asyncio.TimeoutError:
|
||||
pass
|
||||
|
||||
async def run(self) -> None:
|
||||
self._install_signal_handlers()
|
||||
s = self.settings
|
||||
proxy, proxy_label = await self._setup_proxy()
|
||||
self.log.info("starting (C2=%s, proxy=%s, images=%s)",
|
||||
s.c2_url, proxy_label, "on" if s.load_images else "off")
|
||||
browser = await uc.start(
|
||||
headless=False, browser_executable_path=s.browser_path,
|
||||
browser_args=self._browser_args(proxy))
|
||||
try:
|
||||
page = await browser.get("about:blank")
|
||||
await self.warm(page)
|
||||
|
||||
total_wire = 0 # metered (compressed) bytes pulled, lifetime
|
||||
while not self._stop.is_set():
|
||||
job = await self.c2.get_job()
|
||||
if not job:
|
||||
await self._idle()
|
||||
continue
|
||||
|
||||
self.log.info("job %s — %s", job["jobId"][:8], self.describe_job(job))
|
||||
res = await self.scrape_job(page, job)
|
||||
total_wire += res.wire_bytes
|
||||
|
||||
if res.reason == "challenged":
|
||||
await self._on_challenge(page)
|
||||
|
||||
posted = await self.c2.post_result(job["jobId"], {
|
||||
"items": res.items, "pages": res.pages, "stoppedReason": res.reason})
|
||||
self._log_result(res, posted, total_wire)
|
||||
|
||||
await self._pace(page)
|
||||
finally:
|
||||
self.log.info("shutting down")
|
||||
browser.stop()
|
||||
if self._forwarder is not None:
|
||||
await self._forwarder.stop()
|
||||
|
||||
|
||||
def run(worker_cls: type[Worker]) -> None:
|
||||
"""Boot a worker from the environment: parse config, set up logging, run the loop on
|
||||
nodriver's event loop. The thin market scripts call this and nothing else."""
|
||||
from . import log as log_setup
|
||||
|
||||
settings = Settings.from_env()
|
||||
log_setup.configure(settings.log_level, settings.log_json)
|
||||
uc.loop().run_until_complete(worker_cls(settings).run())
|
||||
Reference in New Issue
Block a user