# Operation-Blue-Laminate-v2/worker/discover_pagination.py

"""
Discover how cs.money paginates a filtered search past the initial ~60 SSR items.

Tests two hypotheses against a high-result search (default "ak-47 redline", which has
well over 60 listings):

  A. Does the SSR page honor offset/limit in the URL? Fetch ?search=...&offset=60 and
     ?search=...&limit=120 and compare item ids to page 1. If disjoint/larger, we can
     paginate cheaply by re-fetching the page.
  B. The real client "load more": scroll down gradually (small mouse-wheel steps —
     jumping straight to the bottom skips the infinite-scroll trigger) and record
     every new cs.money request seen via Resource Timing. A /2.0/ XHR among them
     would carry the structured filter params + offset, i.e. a lighter direct-API
     pagination path.

Findings are printed and saved to captures/_pagination.txt.

    cd worker; .venv\\Scripts\\Activate.ps1
    python discover_pagination.py
    $env:SEARCH="ak-47 redline"; python discover_pagination.py   # override the search
"""

import json
import os
import pathlib
import re

import nodriver as uc
from nodriver import cdp

# --- Run-time knobs, each overridable through an environment variable ---
SEARCH = os.getenv("SEARCH", "ak-47 redline")            # search phrase to probe
SOLVE_SECONDS = int(os.getenv("SOLVE_SECONDS", "30"))    # warm-up wait after first load
BROWSER_PATH = os.getenv("BROWSER_PATH")                 # optional browser executable
PROXY = os.getenv("PROXY")                               # optional --proxy-server value

# --- Fixed settings ---
BASE = "https://cs.money/market/buy/"
# Captures the body of the <script id="__page-params"> tag; DOTALL so the
# payload may span several lines.
PAGE_PARAMS_RE = re.compile(
    r'<script\b[^>]*id="__page-params"[^>]*>(.*?)</script>',
    flags=re.DOTALL,
)
# Findings are written into a "captures" directory next to this script.
OUT = pathlib.Path(__file__).parent.joinpath("captures")
# Consent-button labels. NOTE(review): not referenced by any function in the
# visible part of this file.
CONSENT = [
    "Reject all",
    "Only necessary",
    "Reject",
    "Decline",
    "Deny",
]

# Aggressive scroll: window + every scrollable container (the grid scrolls in a div,
# which is why a plain window.scrollTo didn't trigger lazy-load before).
# NOTE(review): not referenced by any function in the visible part of this file.
SCROLL_JS = "".join((
    "window.scrollTo(0, document.body.scrollHeight);",
    "document.querySelectorAll('*').forEach(e=>{",
    "  if (e.scrollHeight > e.clientHeight + 80) e.scrollTop = e.scrollHeight;});",
))


async def js(page, expr):
    """Evaluate the JavaScript expression `expr` in `page` and decode the result.

    The expression is wrapped in JSON.stringify(...) so the value comes back as
    a JSON string, which is parsed into Python data here. Returns None when the
    evaluation yields anything other than a string, or when that string is not
    valid JSON.
    """
    payload = await page.evaluate(f"JSON.stringify({expr})")
    if not isinstance(payload, str):
        return None
    try:
        return json.loads(payload)
    except (json.JSONDecodeError, TypeError):
        return None


async def fetch_text(page, url):
    """Fetch `url` from inside the page (with its cookies) and return the reply.

    The request is made with the browser's own fetch(), credentials included,
    and the promise is awaited by page.evaluate.

    Returns:
        (status, body): the HTTP status and response text, or (None, "") when
        the evaluation did not produce a JSON object (non-string result,
        invalid JSON, or a non-object JSON value).
    """
    # json.dumps yields a valid JS string literal for any URL. Python's repr()
    # does not (it can emit escapes such as \Uxxxxxxxx that JS does not
    # understand), and click_visible embeds its argument the same way.
    expr = (f"fetch({json.dumps(url)},{{credentials:'include'}}).then(async r=>"
            "JSON.stringify({status:r.status, body:await r.text()}))")
    raw = await page.evaluate(expr, await_promise=True)
    try:
        reply = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None, ""
    if not isinstance(reply, dict):
        # e.g. the JSON literal null — nothing to read a status/body from.
        return None, ""
    return reply.get("status"), reply.get("body", "")


def page_item_ids(html):
    """Return the item ids embedded in a page's `__page-params` script tag.

    The tag body is parsed as JSON and the ids are read from
    payload["inventory"]["items"][i]["id"]. An item dict without an "id" key
    contributes None, as before.

    Returns [] when `html` is empty/None, the tag is missing, the body is not
    valid JSON, or the payload does not have the expected dict/list shape.
    """
    match = PAGE_PARAMS_RE.search(html or "")
    if not match:
        return []
    try:
        params = json.loads(match.group(1))
    except json.JSONDecodeError:
        return []
    # The payload's shape is what this script is probing, so don't assume it:
    # a list payload, a null "inventory"/"items", or a non-dict item would
    # otherwise raise AttributeError/TypeError instead of yielding "no ids".
    inventory = params.get("inventory") if isinstance(params, dict) else None
    items = inventory.get("items") if isinstance(inventory, dict) else None
    if not isinstance(items, list):
        return []
    return [item.get("id") for item in items if isinstance(item, dict)]


async def click_visible(page, pattern):
    """Click the first visible element whose trimmed text matches `pattern`.

    `pattern` is compiled in the page as a case-insensitive RegExp. Candidates
    are button, a, [role="button"], span and div elements; one qualifies when
    its offsetParent is not null (i.e. it is rendered) and its trimmed text is
    shorter than 40 characters. nodriver's find() was matching hidden/duplicate
    nodes, hence this in-page search.

    Returns True when the page reports that an element was clicked.
    """
    script = "".join((
        "JSON.stringify((()=>{",
        "const re=new RegExp(", json.dumps(pattern), ",'i');",
        "const els=[...document.querySelectorAll('button,a,[role=\"button\"],span,div')];",
        "const b=els.find(e=>e.offsetParent!==null && (e.textContent||'').trim().length<40 ",
        "&& re.test((e.textContent||'').trim()));",
        "if(b){b.click();return true}return false})())",
    ))
    outcome = await page.evaluate(script)
    if not isinstance(outcome, str):
        return False
    return "true" in outcome


async def banner_present(page):
    """Report whether the page text still contains the cookie-banner wording.

    Checks document.body.innerText for "Manage cookies" or "Accept all"
    (case-insensitive). Returns False when the evaluation does not yield a
    string.
    """
    probe = "JSON.stringify(/Manage cookies|Accept all/i.test(document.body.innerText||''))"
    verdict = await page.evaluate(probe)
    return "true" in verdict if isinstance(verdict, str) else False


async def dismiss(page):
    """Try to get rid of the cookie banner and describe what was done.

    The privacy-preserving route is attempted first (Manage -> Reject all ->
    Confirm). If the banner wording is still on the page afterwards, "Accept
    all" is clicked as a fallback so the page becomes interactive (discovery
    needs scrolling to work).

    Returns a comma-separated trail of the steps taken, always ending in
    "gone" or "STILL-PRESENT".
    """
    trail = []

    opened_manage = await click_visible(page, "manage cookies|^manage$")
    if opened_manage:
        trail.append("manage")
        await page.sleep(1.2)  # pause before looking for the reject button
        rejected = await click_visible(page, "reject all")
        if rejected:
            trail.append("reject-all")
        await page.sleep(0.4)
        # The confirm button's label varies; stop at the first one that clicks.
        for confirm_label in ("confirm my choice", "^confirm$", "^save$"):
            if await click_visible(page, confirm_label):
                trail.append("confirm")
                break

    await page.sleep(1)
    if await banner_present(page):
        accepted = await click_visible(page, "accept all|^accept$")
        trail.append("still-up->accept" if accepted else "still-up")

    await page.sleep(0.5)
    still_there = await banner_present(page)
    trail.append("STILL-PRESENT" if still_there else "gone")
    return ", ".join(trail)


def _new_market_requests(names, seen):
    """Return the unique cs.money URLs in `names` that are not in `seen`.

    URLs containing "metrics." or "traces." are dropped; first-seen order is kept.
    """
    return list(dict.fromkeys(
        u for u in names
        if u not in seen and "cs.money" in u and "metrics." not in u and "traces." not in u))


async def main():
    """Run both pagination probes against SEARCH and save the findings.

    Steps:
      1. Start a headed browser (images disabled, optional proxy), open the
         search page, wait SOLVE_SECONDS, then dismiss the consent banner.
      2. Probe A: fetch the page plain, with &offset=60 and with &limit=120,
         and compare the item ids embedded in each response.
      3. Probe B: send 60 mouse-wheel events and list the cs.money requests
         that appeared in Resource Timing since before the scrolling.
      4. Print the findings and write them to captures/_pagination.txt.

    The browser is stopped even when a step raises.
    """
    from urllib.parse import quote_plus

    OUT.mkdir(exist_ok=True)
    args = [f"--proxy-server={PROXY}"] if PROXY else []
    args.append("--blink-settings=imagesEnabled=false")
    q = quote_plus(SEARCH)
    findings = []
    resource_names = "performance.getEntriesByType('resource').map(e=>e.name)"

    browser = await uc.start(headless=False, browser_executable_path=BROWSER_PATH, browser_args=args)
    try:
        url0 = f"{BASE}?search={q}"
        page = await browser.get(url0)
        print(f"Warming on {url0} ({SOLVE_SECONDS}s for Cloudflare)...")
        await page.sleep(SOLVE_SECONDS)
        print(f"Consent: {await dismiss(page)}")

        # --- A. URL offset/limit on the SSR page ---
        s0, h0 = await fetch_text(page, f"{BASE}?search={q}")
        s1, h1 = await fetch_text(page, f"{BASE}?search={q}&offset=60")
        s2, h2 = await fetch_text(page, f"{BASE}?search={q}&limit=120")
        a, b, c = page_item_ids(h0), page_item_ids(h1), page_item_ids(h2)
        overlap = len(set(a) & set(b))
        # Record the statuses: a failed/blocked fetch also yields zero ids and
        # would otherwise be indistinguishable from "parameter ignored".
        findings.append(f"http status: page1={s0}  offset=60 -> {s1}  limit=120 -> {s2}")
        findings.append(f"page1 ids={len(a)}  offset=60 ids={len(b)} (overlap with page1={overlap})  limit=120 ids={len(c)}")
        if not a:
            # With no page-1 ids the overlap is trivially 0, so "disjoint"
            # would be a false positive.
            offset_verdict = "inconclusive (page1 returned no ids)"
        elif b and overlap == 0:
            offset_verdict = "YES (disjoint)"
        else:
            offset_verdict = "no/ignored"
        findings.append(f"  -> offset works? {offset_verdict}")
        findings.append(f"  -> limit works?  {'YES (>60)' if len(c) > 60 else 'no/ignored'}")

        # --- B. Trigger client load-more, capture cs.money /2.0/ XHRs ---
        # Infinite scroll only fires on GRADUAL downward scrolling — jumping to the
        # bottom skips the trigger. So step down in small wheel increments and watch
        # the item count grow.
        before = set(await js(page, resource_names) or [])

        async def card_count():
            return await js(
                page,
                "document.querySelectorAll('[href*=\"/item/\"],[class*=\"item\" i]').length")

        print(f"  cards before scroll: {await card_count()}")
        wheel_failures = 0
        last_wheel_error = None
        for step in range(60):
            try:
                await page.send(cdp.input_.dispatch_mouse_event(
                    type_="mouseWheel", x=720, y=450, delta_x=0, delta_y=500))
            except Exception as exc:
                # Best-effort: keep scrolling, but remember the failure so the
                # report can say the scroll may not have happened.
                wheel_failures += 1
                last_wheel_error = exc
            await page.sleep(0.7)
            if step % 15 == 14:
                now = _new_market_requests(await js(page, resource_names) or [], before)
                print(f"  step {step+1}: cards={await card_count()} new cs.money reqs={len(now)}")
        after = await js(page, resource_names) or []
        # De-duplicated up front so the count in the header matches the listing.
        new_xhrs = _new_market_requests(after, before)
        findings.append(f"\nclient requests after scrolling ({len(new_xhrs)} new cs.money):")
        findings.extend(f"  {u}" for u in new_xhrs)
        if not new_xhrs:
            findings.append("  (none — grid may not lazy-load via XHR, or scroll didn't reach the trigger)")
        if wheel_failures:
            findings.append(f"  ({wheel_failures} of 60 wheel events failed to dispatch; last error: {last_wheel_error!r})")

        report = "\n".join(findings)
        print("\n=== FINDINGS ===\n" + report)
        (OUT / "_pagination.txt").write_text(f"search: {SEARCH}\n\n{report}\n", encoding="utf-8")
        print(f"\nsaved to {OUT / '_pagination.txt'}")
    finally:
        browser.stop()


# Script entry point: when executed directly (not imported), run main() to
# completion on the event loop returned by uc.loop().
if __name__ == "__main__":
    uc.loop().run_until_complete(main())