Add cs.money worker stack with per-worker IPRoyal residential proxy
Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,211 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using BlueLaminate.Scraper.Browser;
|
||||
using BlueLaminate.Scraper.Proxies;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using OpenQA.Selenium;
|
||||
|
||||
namespace BlueLaminate.Scraper.CsMoney;
|
||||
|
||||
/// <summary>
/// Immutable summary of one stealth pagination run against the listings API.
/// </summary>
/// <param name="PagesSucceeded">Number of offset pages that were captured before the run stopped.</param>
/// <param name="ItemsTotal">Sum of the listing items counted across the captured pages.</param>
/// <param name="StoppedReason">
/// Why the run ended: <c>"challenged"</c>, <c>"empty"</c>, <c>"completed"</c>, or <c>"error"</c>.
/// </param>
public sealed record CsMoneyCaptureResult(
    int PagesSucceeded,
    int ItemsTotal,
    string StoppedReason);
|
||||
|
||||
/// <summary>
/// Drives a low-fingerprint, non-headless Edge (no CDP) through a local forwarding
/// proxy to the cs.money market, lets the operator clear Cloudflare once, then pages
/// the listings API with human-like pacing using in-page <c>fetch()</c> calls from
/// the cleared origin (so the cf_clearance cookie rides along). It records each
/// page's JSON and — crucially for the current phase — <b>measures how many pages
/// survive before Cloudflare re-challenges</b>, which tells us whether the
/// fingerprint reductions are enough for a real sweep.
/// </summary>
public sealed class CsMoneyCaptureService
{
    // Offset increment between consecutive listings API requests.
    private const int PageSize = 60;

    private readonly IProxyProvider _provider;
    private readonly LocalForwardingProxyFactory _proxyFactory;
    private readonly BrowserDriverFactory _factory;
    private readonly CsMoneyOptions _options;
    private readonly ILogger<CsMoneyCaptureService> _logger;

    /// <summary>Creates the service from its injected collaborators.</summary>
    /// <param name="provider">Source of proxy leases (used only when a run asks for a proxy).</param>
    /// <param name="proxyFactory">Builds the local forwarding proxy for an acquired lease.</param>
    /// <param name="factory">Creates the browser driver for a run.</param>
    /// <param name="options">Market URL, API URL template, profile directory and pacing settings.</param>
    /// <param name="logger">Destination for progress and failure messages.</param>
    public CsMoneyCaptureService(
        IProxyProvider provider,
        LocalForwardingProxyFactory proxyFactory,
        BrowserDriverFactory factory,
        CsMoneyOptions options,
        ILogger<CsMoneyCaptureService> logger)
    {
        _provider = provider;
        _proxyFactory = proxyFactory;
        _factory = factory;
        _options = options;
        _logger = logger;
    }

    /// <summary>
    /// Open the market, wait for <paramref name="browseUntilDone"/> (the operator
    /// clears Cloudflare and presses Enter), then page the listings API up to
    /// <paramref name="maxPages"/> times, stopping early on a re-challenge or an
    /// empty page. Each page's body is written to <paramref name="outputDir"/>.
    /// </summary>
    /// <param name="outputDir">Directory that receives the captured bodies; created if missing.</param>
    /// <param name="request">Proxy request handed to the provider when <paramref name="useProxy"/> is true.</param>
    /// <param name="loadImages">When false the browser is created with image blocking enabled.</param>
    /// <param name="useProxy">When false no lease is acquired and the browser gets no proxy endpoint.</param>
    /// <param name="maxPages">Upper bound on the number of pages to capture.</param>
    /// <param name="browseUntilDone">Awaited once after navigation, before any API call is made.</param>
    /// <param name="ct">Cancels the run between pages, during file writes and during pacing delays.</param>
    /// <returns>Pages and items captured so far, plus the reason pagination stopped.</returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="ct"/> was cancelled. Cancellation is never reported as an <c>"error"</c> result.
    /// </exception>
    /// <remarks>
    /// Failures while setting up (creating the output directory, acquiring/starting the
    /// proxy, creating the browser) propagate to the caller. Failures after the browser
    /// exists are logged and reported as <c>"error"</c> with the counts reached so far.
    /// </remarks>
    public async Task<CsMoneyCaptureResult> RunAsync(
        string outputDir,
        ProxyRequest request,
        bool loadImages,
        bool useProxy,
        int maxPages,
        Func<Task> browseUntilDone,
        CancellationToken ct = default)
    {
        Directory.CreateDirectory(outputDir);

        var pages = 0;
        var items = 0;
        var reason = "completed";

        LocalForwardingProxy? localProxy = null;
        try
        {
            // --no-proxy (useProxy=false) drives the automated browser on the machine's
            // own IP, to isolate whether a re-challenge is the IPRoyal exit's reputation
            // or the webdriver fingerprint itself.
            string? proxyEndpoint = null;
            if (useProxy)
            {
                var lease = _provider.Acquire(request);
                localProxy = _proxyFactory.Create(lease).Start();
                proxyEndpoint = localProxy.Endpoint;
            }

            // Created inside the outer try so a failing browser launch still tears
            // down the already-running local proxy.
            var driver = _factory.Create(proxyEndpoint, blockImages: !loadImages, _options.ProfileDir);
            try
            {
                driver.Manage().Timeouts().PageLoad = TimeSpan.FromSeconds(90);
                driver.Manage().Timeouts().AsynchronousJavaScript = TimeSpan.FromSeconds(45);

                _logger.LogInformation("Navigating to {Url}", _options.MarketUrl);
                driver.Navigate().GoToUrl(_options.MarketUrl);

                // Operator clears the Cloudflare challenge in the visible window, waits
                // until the market grid is actually rendered, then presses Enter.
                await browseUntilDone();

                for (var offset = 0; pages < maxPages; offset += PageSize)
                {
                    ct.ThrowIfCancellationRequested();

                    // URLs are machine-readable: format the offset culture-invariantly.
                    var apiUrl = string.Format(
                        System.Globalization.CultureInfo.InvariantCulture,
                        _options.ApiUrlTemplate,
                        offset);
                    var (status, body) = DirectFetch(driver, apiUrl);

                    if (LooksLikeChallenge(status, body))
                    {
                        _logger.LogWarning(
                            "Re-challenged at offset {Offset} (after {Pages} clean page(s)). Stopping.",
                            offset, pages);
                        await WriteAsync(outputDir, $"challenge_offset_{offset}.html", body, ct);
                        reason = "challenged";
                        break;
                    }

                    var count = TryCountItems(body);
                    if (count is 0)
                    {
                        _logger.LogInformation("Offset {Offset} returned no items — end of listings.", offset);
                        reason = "empty";
                        break;
                    }

                    // A null count (no recognisable items[] array) is still saved and
                    // counted as a page; it simply contributes zero items.
                    await WriteAsync(outputDir, $"page_{pages:D3}_offset_{offset}.json", body, ct);
                    pages++;
                    items += count ?? 0;
                    _logger.LogInformation(
                        "Page {Page} [offset {Offset}] [{Status}] → {Count} items ({Bytes} bytes).",
                        pages, offset, status, count, body.Length);

                    await DelayAsync(ct);
                }
            }
            catch (Exception ex) when (ex is not OperationCanceledException)
            {
                // Cancellation is filtered out so it propagates to the caller untouched.
                _logger.LogError(ex, "cs.money capture failed after {Pages} page(s).", pages);
                reason = "error";
            }
            finally
            {
                driver.Quit();
            }
        }
        finally
        {
            // Outer finally: runs even when browser creation or driver.Quit() throws.
            if (localProxy is not null)
            {
                await localProxy.DisposeAsync();
            }
        }

        return new CsMoneyCaptureResult(pages, items, reason);
    }

    // Run a same-origin fetch() in the cleared page and return (status, body). Uses
    // ExecuteAsyncScript so we can await the fetch promise; the page is on the
    // cs.money origin, so the cf_clearance cookie is sent automatically.
    // A fetch failure inside the page, or an empty script result, is reported as
    // status -1.
    private static (int Status, string Body) DirectFetch(IWebDriver driver, string apiUrl)
    {
        const string script = """
            const url = arguments[0];
            const done = arguments[arguments.length - 1];
            fetch(url, { credentials: 'include', headers: { 'accept': 'application/json' } })
              .then(r => r.text().then(t => done(JSON.stringify({ status: r.status, body: t }))))
              .catch(e => done(JSON.stringify({ status: -1, body: String(e) })));
            """;

        var raw = ((IJavaScriptExecutor)driver).ExecuteAsyncScript(script, apiUrl) as string;
        if (string.IsNullOrEmpty(raw))
        {
            return (-1, "");
        }

        // The script always calls back with a JSON envelope: { status, body }.
        using var doc = JsonDocument.Parse(raw);
        var root = doc.RootElement;
        var status = root.GetProperty("status").GetInt32();
        var body = root.GetProperty("body").GetString() ?? "";
        return (status, body);
    }

    // True when the response is treated as a Cloudflare challenge rather than listings:
    // status 403/503, an in-page fetch failure (-1), known challenge markers in the
    // body, or a body that starts with '<' (HTML, not JSON).
    private static bool LooksLikeChallenge(int status, string body) =>
        status is 403 or 503 or -1
        || body.Contains("Just a moment", StringComparison.OrdinalIgnoreCase)
        || body.Contains("challenge-platform", StringComparison.OrdinalIgnoreCase)
        || body.TrimStart().StartsWith("<", StringComparison.Ordinal); // HTML, not JSON

    // Count items[] without binding a full model (the typed model is Phase 2).
    // Returns null when the body is not valid JSON, its root is not an object, or it
    // has no "items" array.
    private static int? TryCountItems(string body)
    {
        try
        {
            using var doc = JsonDocument.Parse(body);
            var root = doc.RootElement;

            // TryGetProperty throws InvalidOperationException on a non-object root
            // (e.g. "[]" or "123"), so rule that out first.
            if (root.ValueKind != JsonValueKind.Object)
            {
                return null;
            }

            return root.TryGetProperty("items", out var items)
                && items.ValueKind == JsonValueKind.Array
                    ? items.GetArrayLength()
                    : null;
        }
        catch (JsonException)
        {
            return null;
        }
    }

    // Pause between pages: the configured base delay (clamped at zero) plus a random
    // jitter in [0, PageJitterSeconds). No delay is awaited when the total is zero.
    private async Task DelayAsync(CancellationToken ct)
    {
        var jitter = _options.PageJitterSeconds > 0
            ? Random.Shared.NextDouble() * _options.PageJitterSeconds
            : 0;
        var seconds = Math.Max(0, _options.PageDelaySeconds) + jitter;
        if (seconds > 0)
        {
            await Task.Delay(TimeSpan.FromSeconds(seconds), ct);
        }
    }

    // Write one captured body as UTF-8 text to dir/fileName.
    private static async Task WriteAsync(string dir, string fileName, string body, CancellationToken ct) =>
        await File.WriteAllTextAsync(Path.Combine(dir, fileName), body, Encoding.UTF8, ct);
}
|
||||
Reference in New Issue
Block a user