Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
212 lines
8.3 KiB
C#
212 lines
8.3 KiB
C#
using System.Text;
|
|
using System.Text.Json;
|
|
using BlueLaminate.Scraper.Browser;
|
|
using BlueLaminate.Scraper.Proxies;
|
|
using Microsoft.Extensions.Logging;
|
|
using OpenQA.Selenium;
|
|
|
|
namespace BlueLaminate.Scraper.CsMoney;
|
|
|
|
/// <summary>
/// Summary of one stealth pagination run: how far it got and why it stopped.
/// </summary>
/// <param name="PagesSucceeded">Number of offset pages that returned listings JSON before the run stopped.</param>
/// <param name="ItemsTotal">Sum of the listing items captured across those pages.</param>
/// <param name="StoppedReason">
/// Why pagination ended: <c>"challenged"</c>, <c>"empty"</c>, <c>"completed"</c>, or <c>"error"</c>.
/// </param>
public sealed record CsMoneyCaptureResult(
    int PagesSucceeded,
    int ItemsTotal,
    string StoppedReason);
|
|
|
|
/// <summary>
/// Drives a low-fingerprint, non-headless Edge (no CDP) through a local forwarding
/// proxy to the cs.money market, lets the operator clear Cloudflare once, then pages
/// the listings API with human-like pacing using in-page <c>fetch()</c> calls from
/// the cleared origin (so the cf_clearance cookie rides along). It records each
/// page's JSON and — crucially for the current phase — <b>measures how many pages
/// survive before Cloudflare re-challenges</b>, which tells us whether the
/// fingerprint reductions are enough for a real sweep.
/// </summary>
public sealed class CsMoneyCaptureService
{
    // Amount the listings offset advances per request.
    // NOTE(review): presumably matches the page size baked into ApiUrlTemplate — confirm.
    private const int OffsetStep = 60;

    // UTF-8 *without* a byte-order mark. Encoding.UTF8 makes File.WriteAllTextAsync
    // prefix every file with EF BB BF, which strict JSON readers reject.
    private static readonly UTF8Encoding Utf8NoBom = new(encoderShouldEmitUTF8Identifier: false);

    private readonly IProxyProvider _provider;
    private readonly LocalForwardingProxyFactory _proxyFactory;
    private readonly BrowserDriverFactory _factory;
    private readonly CsMoneyOptions _options;
    private readonly ILogger<CsMoneyCaptureService> _logger;

    /// <summary>Stores the collaborators used by <see cref="RunAsync"/>.</summary>
    /// <param name="provider">Source of proxy leases (only used when a run asks for a proxy).</param>
    /// <param name="proxyFactory">Builds the local forwarding proxy for a lease.</param>
    /// <param name="factory">Creates the browser driver.</param>
    /// <param name="options">Market URL, API URL template, profile directory and pacing settings.</param>
    /// <param name="logger">Destination for progress and failure messages.</param>
    public CsMoneyCaptureService(
        IProxyProvider provider,
        LocalForwardingProxyFactory proxyFactory,
        BrowserDriverFactory factory,
        CsMoneyOptions options,
        ILogger<CsMoneyCaptureService> logger)
    {
        _provider = provider;
        _proxyFactory = proxyFactory;
        _factory = factory;
        _options = options;
        _logger = logger;
    }

    /// <summary>
    /// Open the market, wait for <paramref name="browseUntilDone"/> (the operator
    /// clears Cloudflare and presses Enter), then page the listings API up to
    /// <paramref name="maxPages"/> times, stopping early on a re-challenge or an
    /// empty page. Each page's body is written to <paramref name="outputDir"/>.
    /// </summary>
    /// <param name="outputDir">Directory for captured bodies; created if it does not exist.</param>
    /// <param name="request">Proxy request handed to the provider when <paramref name="useProxy"/> is true.</param>
    /// <param name="loadImages">When false the driver is created with image blocking enabled.</param>
    /// <param name="useProxy">When false no lease is acquired and the driver gets no proxy endpoint.</param>
    /// <param name="maxPages">Upper bound on the number of pages recorded.</param>
    /// <param name="browseUntilDone">Awaited once after navigation, before any API call is made.</param>
    /// <param name="ct">Checked before each page and passed to the file writes and the inter-page delay.</param>
    /// <returns>Pages recorded, items counted, and the reason the loop ended.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled during the run. The browser and proxy are still shut down.
    /// </exception>
    /// <remarks>
    /// Failures while acquiring the proxy or creating the driver propagate to the caller.
    /// Any other non-cancellation failure after the driver exists is logged and reported
    /// as <c>StoppedReason == "error"</c>.
    /// </remarks>
    public async Task<CsMoneyCaptureResult> RunAsync(
        string outputDir,
        ProxyRequest request,
        bool loadImages,
        bool useProxy,
        int maxPages,
        Func<Task> browseUntilDone,
        CancellationToken ct = default)
    {
        Directory.CreateDirectory(outputDir);

        var pages = 0;
        var items = 0;
        var reason = "completed";

        // --no-proxy (useProxy=false) drives the automated browser on the machine's
        // own IP, to isolate whether a re-challenge is the IPRoyal exit's reputation
        // or the webdriver fingerprint itself.
        LocalForwardingProxy? localProxy = null;
        try
        {
            string? proxyEndpoint = null;
            if (useProxy)
            {
                var lease = _provider.Acquire(request);
                localProxy = _proxyFactory.Create(lease).Start();
                proxyEndpoint = localProxy.Endpoint;
            }

            // Created inside the outer try so that a driver start-up failure still
            // tears down the proxy that was started just above.
            var driver = _factory.Create(proxyEndpoint, blockImages: !loadImages, _options.ProfileDir);
            try
            {
                driver.Manage().Timeouts().PageLoad = TimeSpan.FromSeconds(90);
                driver.Manage().Timeouts().AsynchronousJavaScript = TimeSpan.FromSeconds(45);

                _logger.LogInformation("Navigating to {Url}", _options.MarketUrl);
                driver.Navigate().GoToUrl(_options.MarketUrl);

                // Operator clears the Cloudflare challenge in the visible window, waits
                // until the market grid is actually rendered, then presses Enter.
                await browseUntilDone();

                for (var offset = 0; pages < maxPages; offset += OffsetStep)
                {
                    ct.ThrowIfCancellationRequested();

                    // Invariant culture: the offset ends up in a URL, so it must not
                    // depend on the machine's regional settings.
                    var apiUrl = string.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        _options.ApiUrlTemplate,
                        offset);
                    var (status, body) = DirectFetch(driver, apiUrl);

                    if (LooksLikeChallenge(status, body))
                    {
                        _logger.LogWarning(
                            "Re-challenged at offset {Offset} (after {Pages} clean page(s)). Stopping.",
                            offset, pages);
                        await WriteAsync(outputDir, $"challenge_offset_{offset}.html", body, ct);
                        reason = "challenged";
                        break;
                    }

                    // null means "could not find an items array"; such a body is still
                    // recorded and counted as a page, contributing zero items.
                    var count = TryCountItems(body);
                    if (count is 0)
                    {
                        _logger.LogInformation("Offset {Offset} returned no items — end of listings.", offset);
                        reason = "empty";
                        break;
                    }

                    await WriteAsync(outputDir, $"page_{pages:D3}_offset_{offset}.json", body, ct);
                    pages++;
                    items += count ?? 0;
                    _logger.LogInformation(
                        "Page {Page} [offset {Offset}] [{Status}] → {Count} items ({Bytes} bytes).",
                        pages, offset, status, count, body.Length);

                    await DelayAsync(ct);
                }
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // Cancellation is deliberately excluded by the filter so it reaches the caller.
                _logger.LogError(ex, "cs.money capture failed after {Pages} page(s).", pages);
                reason = "error";
            }
            finally
            {
                driver.Quit();
            }
        }
        finally
        {
            // Separate finally: the proxy is released even if driver creation or
            // driver.Quit() throws.
            if (localProxy is not null)
            {
                await localProxy.DisposeAsync();
            }
        }

        return new CsMoneyCaptureResult(pages, items, reason);
    }

    // Run a same-origin fetch() in the cleared page and return (status, body). Uses
    // ExecuteAsyncScript so we can await the fetch promise; the page is on the
    // cs.money origin, so the cf_clearance cookie is sent automatically.
    // A fetch rejection is reported by the script as status -1 with the error text
    // as the body; a null/empty script result is reported as (-1, "").
    private static (int Status, string Body) DirectFetch(IWebDriver driver, string apiUrl)
    {
        const string script = """
            const url = arguments[0];
            const done = arguments[arguments.length - 1];
            fetch(url, { credentials: 'include', headers: { 'accept': 'application/json' } })
              .then(r => r.text().then(t => done(JSON.stringify({ status: r.status, body: t }))))
              .catch(e => done(JSON.stringify({ status: -1, body: String(e) })));
            """;

        if (((IJavaScriptExecutor)driver).ExecuteAsyncScript(script, apiUrl) is not string { Length: > 0 } raw)
        {
            return (-1, "");
        }

        // The script always calls back with a {status, body} envelope serialized as JSON.
        using var envelope = JsonDocument.Parse(raw);
        var root = envelope.RootElement;
        return (root.GetProperty("status").GetInt32(), root.GetProperty("body").GetString() ?? "");
    }

    // True when the response should be treated as a Cloudflare re-challenge: a
    // 403/503 status, the -1 "fetch failed" sentinel, a known challenge marker in
    // the body, or a body that starts with '<' (HTML, not JSON).
    private static bool LooksLikeChallenge(int status, string body) =>
        status is 403 or 503 or -1
        || body.Contains("Just a moment", StringComparison.OrdinalIgnoreCase)
        || body.Contains("challenge-platform", StringComparison.OrdinalIgnoreCase)
        || body.TrimStart().StartsWith("<", StringComparison.Ordinal);

    // Count items[] without binding a full model (the typed model is Phase 2).
    // Returns null when the body is not valid JSON, when its root is not an object,
    // or when it has no array-valued "items" property.
    private static int? TryCountItems(string body)
    {
        try
        {
            using var doc = JsonDocument.Parse(body);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on a non-object root
            // (e.g. a bare array or a string), so check the kind first.
            if (root.ValueKind == JsonValueKind.Object
                && root.TryGetProperty("items", out var items)
                && items.ValueKind == JsonValueKind.Array)
            {
                return items.GetArrayLength();
            }

            return null;
        }
        catch (JsonException)
        {
            return null;
        }
    }

    // Pause between pages: the configured base delay (negative values clamp to 0)
    // plus a uniformly random jitter in [0, PageJitterSeconds). No-op when the
    // total is zero.
    private async Task DelayAsync(CancellationToken ct)
    {
        var jitter = _options.PageJitterSeconds > 0
            ? Random.Shared.NextDouble() * _options.PageJitterSeconds
            : 0;
        var seconds = Math.Max(0, _options.PageDelaySeconds) + jitter;
        if (seconds > 0)
        {
            await Task.Delay(TimeSpan.FromSeconds(seconds), ct);
        }
    }

    // Write one captured body into the output directory as BOM-less UTF-8.
    private static async Task WriteAsync(string dir, string fileName, string body, CancellationToken ct) =>
        await File.WriteAllTextAsync(Path.Combine(dir, fileName), body, Utf8NoBom, ct);
}
|