Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
123 lines
4.9 KiB
C#
123 lines
4.9 KiB
C#
using BlueLaminate.Scraper.CsMoney;
|
|
using BlueLaminate.Scraper.Proxies;
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
using Microsoft.Extensions.Hosting;
|
|
using Microsoft.Extensions.Options;
|
|
using System.CommandLine;
|
|
|
|
namespace BlueLaminate.Cli.Commands;
|
|
|
|
/// <summary>
/// <c>capture-csmoney</c>: open the cs.money market through the IPRoyal residential
/// proxy (local forwarding hop, no CDP) in a real, non-headless browser. You clear
/// the Cloudflare challenge once; the tool then pages the listings API from inside
/// the cleared page with human-like pacing, dumping each page's JSON and reporting
/// how many pages survive before a re-challenge. Discovery/measurement tool — writes
/// nothing to the database. Reads IPROYAL_USERNAME / IPROYAL_PASSWORD.
/// </summary>
internal static class CaptureCsMoneyCommand
{
    /// <summary>Exit code returned when the run is cancelled via the command's token.</summary>
    private const int CancelledExitCode = 130;

    /// <summary>
    /// Builds the <c>capture-csmoney</c> command, declares its options and wires the
    /// action to <see cref="RunAsync"/>.
    /// </summary>
    /// <param name="host">Host whose service provider supplies the capture service and options.</param>
    /// <returns>The configured command, ready to be added to a parent command.</returns>
    public static Command Build(IHost host)
    {
        var countryOption = new Option<string?>("--country")
        {
            Description = "ISO country code(s) for the exit IP, e.g. \"us\". Default: configured/random.",
        };
        var loadImagesOption = new Option<bool>("--load-images")
        {
            Description = "Load images (uses more bandwidth). Default off to conserve the metered plan.",
        };
        var pagesOption = new Option<int>("--pages")
        {
            Description = "Maximum offset pages (60 items each) to fetch before stopping.",
            DefaultValueFactory = _ => 50,
        };
        var noProxyOption = new Option<bool>("--no-proxy")
        {
            Description = "Diagnostic: drive the browser on this machine's own IP (no IPRoyal proxy), "
                + "to isolate whether re-challenges are IP reputation vs. the webdriver fingerprint.",
        };
        var outOption = new Option<string>("--out")
        {
            Description = "Directory to write captured JSON pages to.",
            DefaultValueFactory = _ => "csmoney-captures",
        };

        var command = new Command(
            "capture-csmoney",
            "Open the cs.money market through the residential proxy, clear Cloudflare once, then page "
                + "the listings API with pacing and report how many pages survive. Discovery/measurement "
                + "tool — writes nothing to the database. Reads IPROYAL_USERNAME / IPROYAL_PASSWORD.")
        {
            countryOption,
            loadImagesOption,
            pagesOption,
            outOption,
            noProxyOption,
        };

        command.SetAction((parseResult, ct) => RunAsync(
            host,
            parseResult.GetValue(countryOption),
            parseResult.GetValue(loadImagesOption),
            parseResult.GetValue(pagesOption),
            parseResult.GetValue(outOption)!,
            parseResult.GetValue(noProxyOption),
            ct));

        return command;
    }

    /// <summary>
    /// Runs one interactive capture: prints operator instructions, hands control to
    /// <c>CsMoneyCaptureService</c>, then prints a summary of how far paging got.
    /// </summary>
    /// <param name="host">Host used to create the DI scope for this run.</param>
    /// <param name="country">Exit-country override; blank falls back to the configured country.</param>
    /// <param name="loadImages">Forces image loading on; otherwise the configured setting applies.</param>
    /// <param name="pages">Maximum number of pages to fetch; must be at least 1.</param>
    /// <param name="outDir">Directory passed to the capture service for the JSON dumps.</param>
    /// <param name="noProxy">When true, the capture service is told not to use the proxy.</param>
    /// <param name="ct">Cancellation token supplied by the command invocation.</param>
    /// <returns>
    /// 0 when at least one page succeeded; 1 on failure, on an invalid <paramref name="pages"/>
    /// value, or when no page succeeded; 130 when the run was cancelled through <paramref name="ct"/>.
    /// </returns>
    private static async Task<int> RunAsync(
        IHost host, string? country, bool loadImages, int pages, string outDir, bool noProxy,
        CancellationToken ct)
    {
        // A non-positive page budget can never fetch anything; reject it up front rather
        // than asking the operator to clear a challenge for a run that cannot succeed.
        if (pages < 1)
        {
            Console.Error.WriteLine($"--pages must be at least 1 (got {pages}).");
            return 1;
        }

        using var scope = host.Services.CreateScope();
        var options = scope.ServiceProvider.GetRequiredService<IOptions<CsMoneyOptions>>().Value;

        // Command-line values win; configuration supplies the fallback.
        var exitCountry = string.IsNullOrWhiteSpace(country) ? options.Country : country;
        var images = loadImages || options.LoadImages;

        Console.WriteLine($"Opening {options.MarketUrl}{(noProxy ? " (DIRECT — no proxy)" : "")}");
        Console.WriteLine(
            "Solve any Cloudflare challenge in the window and wait until the market grid "
            + "(items + prices) is actually visible — that means the session is cleared.");
        Console.WriteLine(
            $"Press Enter here once it's visible. The tool then pages up to {pages} page(s) of "
            + "listings from inside the cleared page and reports how far it gets.");

        try
        {
            var capture = scope.ServiceProvider.GetRequiredService<CsMoneyCaptureService>();

            // Block until the operator presses Enter; the browser stays open the whole
            // time. ReadLine is sync, so push it off-thread. The token given to Task.Run
            // is only observed before the delegate starts, so WaitAsync is what lets a
            // cancellation end the wait while ReadLine is still blocked.
            var result = await capture.RunAsync(
                outDir,
                new ProxyRequest(Country: exitCountry, Sticky: true),
                images,
                useProxy: !noProxy,
                pages,
                () => Task.Run(() => Console.ReadLine(), ct).WaitAsync(ct),
                ct);

            var full = Path.GetFullPath(outDir);
            Console.WriteLine();
            Console.WriteLine(
                $"Stopped: {result.StoppedReason}. {result.PagesSucceeded} page(s), "
                + $"{result.ItemsTotal} item(s) → {full}");
            return result.PagesSucceeded > 0 ? 0 : 1;
        }
        catch (OperationCanceledException) when (ct.IsCancellationRequested)
        {
            // Only a cancellation requested through our token counts as "cancelled";
            // any other OperationCanceledException is reported as a failure below.
            Console.Error.WriteLine("Capture cancelled.");
            return CancelledExitCode;
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"cs.money capture failed: {ex.Message}");
            return 1;
        }
    }
}
|