Add cs.money worker stack with per-worker IPRoyal residential proxy

Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration.

IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
bob
2026-05-31 15:03:31 -05:00
parent eb5fb0dac7
commit dc7c3f99ae
82 changed files with 8354 additions and 571 deletions

View File

@@ -0,0 +1,211 @@
using System.Text;
using System.Text.Json;
using BlueLaminate.Scraper.Browser;
using BlueLaminate.Scraper.Proxies;
using Microsoft.Extensions.Logging;
using OpenQA.Selenium;
namespace BlueLaminate.Scraper.CsMoney;
/// <summary>Outcome of a stealth pagination run.</summary>
/// <param name="PagesSucceeded">How many offset pages returned listings JSON before stopping.</param>
/// <param name="ItemsTotal">Total listing items captured across those pages.</param>
/// <param name="StoppedReason">Why pagination stopped: "challenged", "empty", "completed", or "error".</param>
public sealed record CsMoneyCaptureResult(int PagesSucceeded, int ItemsTotal, string StoppedReason);
/// <summary>
/// Drives a low-fingerprint, non-headless Edge (no CDP) through a local forwarding
/// proxy to the cs.money market, lets the operator clear Cloudflare once, then pages
/// the listings API with human-like pacing using in-page <c>fetch()</c> calls from
/// the cleared origin (so the cf_clearance cookie rides along). It records each
/// page's JSON and — crucially for the current phase — <b>measures how many pages
/// survive before Cloudflare re-challenges</b>, which tells us whether the
/// fingerprint reductions are enough for a real sweep.
/// </summary>
public sealed class CsMoneyCaptureService
{
private readonly IProxyProvider _provider;
private readonly LocalForwardingProxyFactory _proxyFactory;
private readonly BrowserDriverFactory _factory;
private readonly CsMoneyOptions _options;
private readonly ILogger<CsMoneyCaptureService> _logger;
public CsMoneyCaptureService(
IProxyProvider provider,
LocalForwardingProxyFactory proxyFactory,
BrowserDriverFactory factory,
CsMoneyOptions options,
ILogger<CsMoneyCaptureService> logger)
{
_provider = provider;
_proxyFactory = proxyFactory;
_factory = factory;
_options = options;
_logger = logger;
}
/// <summary>
/// Open the market, wait for <paramref name="browseUntilDone"/> (the operator
/// clears Cloudflare and presses Enter), then page the listings API up to
/// <paramref name="maxPages"/> times, stopping early on a re-challenge or an
/// empty page. Each page's body is written to <paramref name="outputDir"/>.
/// </summary>
public async Task<CsMoneyCaptureResult> RunAsync(
string outputDir,
ProxyRequest request,
bool loadImages,
bool useProxy,
int maxPages,
Func<Task> browseUntilDone,
CancellationToken ct = default)
{
Directory.CreateDirectory(outputDir);
// --no-proxy (useProxy=false) drives the automated browser on the machine's
// own IP, to isolate whether a re-challenge is the IPRoyal exit's reputation
// or the webdriver fingerprint itself.
LocalForwardingProxy? localProxy = null;
string? proxyEndpoint = null;
if (useProxy)
{
var lease = _provider.Acquire(request);
localProxy = _proxyFactory.Create(lease).Start();
proxyEndpoint = localProxy.Endpoint;
}
var driver = _factory.Create(proxyEndpoint, blockImages: !loadImages, _options.ProfileDir);
var pages = 0;
var items = 0;
var reason = "completed";
try
{
driver.Manage().Timeouts().PageLoad = TimeSpan.FromSeconds(90);
driver.Manage().Timeouts().AsynchronousJavaScript = TimeSpan.FromSeconds(45);
_logger.LogInformation("Navigating to {Url}", _options.MarketUrl);
driver.Navigate().GoToUrl(_options.MarketUrl);
// Operator clears the Cloudflare challenge in the visible window, waits
// until the market grid is actually rendered, then presses Enter.
await browseUntilDone();
for (var offset = 0; pages < maxPages; offset += 60)
{
ct.ThrowIfCancellationRequested();
var apiUrl = string.Format(_options.ApiUrlTemplate, offset);
var (status, body) = DirectFetch(driver, apiUrl);
if (LooksLikeChallenge(status, body))
{
_logger.LogWarning(
"Re-challenged at offset {Offset} (after {Pages} clean page(s)). Stopping.",
offset, pages);
await WriteAsync(outputDir, $"challenge_offset_{offset}.html", body, ct);
reason = "challenged";
break;
}
var count = TryCountItems(body);
if (count is 0)
{
_logger.LogInformation("Offset {Offset} returned no items — end of listings.", offset);
reason = "empty";
break;
}
await WriteAsync(outputDir, $"page_{pages:D3}_offset_{offset}.json", body, ct);
pages++;
items += count ?? 0;
_logger.LogInformation(
"Page {Page} [offset {Offset}] [{Status}] → {Count} items ({Bytes} bytes).",
pages, offset, status, count, body.Length);
await DelayAsync(ct);
}
}
catch (OperationCanceledException)
{
reason = "cancelled";
throw;
}
catch (Exception ex)
{
_logger.LogError(ex, "cs.money capture failed after {Pages} page(s).", pages);
reason = "error";
}
finally
{
driver.Quit();
if (localProxy is not null)
{
await localProxy.DisposeAsync();
}
}
return new CsMoneyCaptureResult(pages, items, reason);
}
// Run a same-origin fetch() in the cleared page and return (status, body). Uses
// ExecuteAsyncScript so we can await the fetch promise; the page is on the
// cs.money origin, so the cf_clearance cookie is sent automatically.
private (int Status, string Body) DirectFetch(IWebDriver driver, string apiUrl)
{
const string script = """
const url = arguments[0];
const done = arguments[arguments.length - 1];
fetch(url, { credentials: 'include', headers: { 'accept': 'application/json' } })
.then(r => r.text().then(t => done(JSON.stringify({ status: r.status, body: t }))))
.catch(e => done(JSON.stringify({ status: -1, body: String(e) })));
""";
var raw = ((IJavaScriptExecutor)driver).ExecuteAsyncScript(script, apiUrl) as string;
if (string.IsNullOrEmpty(raw))
{
return (-1, "");
}
using var doc = JsonDocument.Parse(raw);
var status = doc.RootElement.GetProperty("status").GetInt32();
var body = doc.RootElement.GetProperty("body").GetString() ?? "";
return (status, body);
}
private static bool LooksLikeChallenge(int status, string body) =>
status is 403 or 503 or -1
|| body.Contains("Just a moment", StringComparison.OrdinalIgnoreCase)
|| body.Contains("challenge-platform", StringComparison.OrdinalIgnoreCase)
|| body.TrimStart().StartsWith("<", StringComparison.Ordinal); // HTML, not JSON
// Count items[] without binding a full model (the typed model is Phase 2).
private static int? TryCountItems(string body)
{
try
{
using var doc = JsonDocument.Parse(body);
return doc.RootElement.TryGetProperty("items", out var items)
&& items.ValueKind == JsonValueKind.Array
? items.GetArrayLength()
: null;
}
catch (JsonException)
{
return null;
}
}
private async Task DelayAsync(CancellationToken ct)
{
var jitter = _options.PageJitterSeconds > 0
? Random.Shared.NextDouble() * _options.PageJitterSeconds
: 0;
var seconds = Math.Max(0, _options.PageDelaySeconds) + jitter;
if (seconds > 0)
{
await Task.Delay(TimeSpan.FromSeconds(seconds), ct);
}
}
private static async Task WriteAsync(string dir, string fileName, string body, CancellationToken ct) =>
await File.WriteAllTextAsync(Path.Combine(dir, fileName), body, Encoding.UTF8, ct);
}

View File

@@ -0,0 +1,50 @@
namespace BlueLaminate.Scraper.CsMoney;
/// <summary>
/// Configuration for the cs.money scraper, bound from the <c>CsMoney</c>
/// configuration section.
/// <para>
/// cs.money exposes no public API and sits behind Cloudflare bot protection, so we
/// drive a real, non-headless browser (Selenium/Edge) routed through an IPRoyal
/// residential proxy via a local forwarding hop (no CDP). The market endpoint
/// re-challenges aggressively during pagination, so these options also tune the
/// warmed profile and request pacing we use to survive longer.
/// </para>
/// </summary>
public sealed class CsMoneyOptions
{
public const string SectionName = "CsMoney";
/// <summary>Public market page the browser opens (and where the operator clears Cloudflare).</summary>
public string MarketUrl { get; set; } = "https://cs.money/market/buy/";
/// <summary>
/// Listings API template; <c>{0}</c> is the page offset (steps of 60). Fetched
/// in-page from the cleared market origin so the cf_clearance cookie is sent.
/// </summary>
public string ApiUrlTemplate { get; set; } =
"https://cs.money/2.0/market/sell-orders?limit=60&offset={0}";
/// <summary>
/// Persistent Chromium profile directory. Reusing one profile keeps the
/// cf_clearance cookie and history between runs — a warmed profile is far less
/// likely to be re-challenged than a fresh one. Empty = throwaway profile.
/// </summary>
public string ProfileDir { get; set; } =
Path.Combine(Path.GetTempPath(), "bluelaminate-csmoney-profile");
/// <summary>
/// Optional ISO country code(s) for the residential exit IP, e.g. "us". Null/empty
/// lets IPRoyal pick at random.
/// </summary>
public string? Country { get; set; }
/// <summary>Load images. Off by default to conserve the metered residential plan.</summary>
public bool LoadImages { get; set; }
/// <summary>Base delay between paginated API fetches, in seconds (human-like pacing).</summary>
public double PageDelaySeconds { get; set; } = 2.5;
/// <summary>Extra random jitter added to each delay, in seconds (0..value).</summary>
public double PageJitterSeconds { get; set; } = 2.0;
}