140 lines
5.2 KiB
C#
140 lines
5.2 KiB
C#
using System.Text;
|
|
using BlueLaminate.Scraper.Browser;
|
|
using BlueLaminate.Scraper.Proxies;
|
|
using Microsoft.Extensions.Logging;
|
|
using OpenQA.Selenium;
|
|
|
|
namespace BlueLaminate.Scraper.CsFloat;
|
|
|
|
/// <summary>
/// Phase-B discovery tool. Drives a real Edge browser through a residential
/// lease to a CSFloat search page, then records every CSFloat <c>/api/</c> JSON
/// response to disk while a human clicks around (open a listing → "Latest
/// Sales"). We don't yet know CSFloat's exact endpoints or DOM selectors, so a
/// human-in-the-loop is the cheapest way to surface the real traffic: the tool
/// just listens and dumps, the operator drives the UI in the visible window.
/// Once we can see the captured shapes we can automate navigation and design the
/// tables.
/// </summary>
public sealed class CsFloatCaptureService
{
    // Longest file stem Sanitize will produce; keeps capture paths comfortably short.
    private const int MaxStemLength = 120;

    private readonly IProxyProvider _provider;
    private readonly BrowserDriverFactory _factory;
    private readonly ILogger<CsFloatCaptureService> _logger;

    /// <summary>
    /// Creates the service from its collaborators: the proxy provider that
    /// hands out leases, the factory that builds a browser driver for a lease,
    /// and the logger used for capture/diagnostic output.
    /// </summary>
    public CsFloatCaptureService(
        IProxyProvider provider,
        BrowserDriverFactory factory,
        ILogger<CsFloatCaptureService> logger)
    {
        _provider = provider;
        _factory = factory;
        _logger = logger;
    }

    /// <summary>
    /// Opens <paramref name="url"/> through the proxy and captures CSFloat API
    /// responses to <paramref name="outputDir"/> until <paramref name="browseUntilDone"/>
    /// completes (the CLI ties that to the operator pressing Enter). When
    /// <paramref name="diagnose"/> is true, every CSFloat-domain response is
    /// logged (url + status + type) to reveal where a login wall appears.
    /// </summary>
    /// <param name="url">Page to navigate the browser to.</param>
    /// <param name="outputDir">Directory that receives one <c>.json</c> file per captured response; created if missing.</param>
    /// <param name="request">Proxy request passed to the provider to acquire the lease.</param>
    /// <param name="loadImages">When false, the driver is created with image blocking enabled.</param>
    /// <param name="diagnose">When true, logs every response whose URL mentions CSFloat, not just API calls.</param>
    /// <param name="browseUntilDone">Awaited after navigation; capture stops when it completes.</param>
    /// <returns>The number of responses actually written to disk (failed writes are not counted).</returns>
    /// <exception cref="ArgumentNullException"><paramref name="browseUntilDone"/> is null.</exception>
    public async Task<int> RunAsync(
        string url,
        string outputDir,
        ProxyRequest request,
        bool loadImages,
        bool diagnose,
        Func<Task> browseUntilDone)
    {
        // Fail before a browser is launched rather than after navigation.
        ArgumentNullException.ThrowIfNull(browseUntilDone);

        Directory.CreateDirectory(outputDir);

        // NOTE(review): nothing in this method releases the lease — confirm
        // whether the provider or the driver owns its lifetime.
        var lease = _provider.Acquire(request);
        var driver = await _factory.CreateAsync(lease, blockImages: !loadImages);

        // `sequence` numbers the files (and may skip a number when a write
        // fails); `written` counts only successful writes and is what we return.
        var sequence = 0;
        var written = 0;

        void OnResponse(object? sender, NetworkResponseReceivedEventArgs e)
        {
            var responseUrl = e.ResponseUrl;
            if (string.IsNullOrEmpty(responseUrl)
                || !responseUrl.Contains("csfloat", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            // Diagnose mode logs every CSFloat-domain response — including the
            // SPA shell, redirects and any 401/403 — so we can see exactly where
            // a Steam-login wall appears even before any /api/ call fires.
            if (diagnose)
            {
                _logger.LogInformation("[{Status}] {Type} {Url}",
                    e.ResponseStatusCode, e.ResponseResourceType, responseUrl);
            }

            // Only JSON API calls get written to disk; skip the shell, images,
            // fonts, analytics, etc.
            if (!IsApiUrl(responseUrl))
            {
                return;
            }

            var body = e.ResponseBody;
            if (string.IsNullOrWhiteSpace(body))
            {
                // Body wasn't buffered (e.g. the known Fetch interception race).
                // Log the endpoint so we still learn it exists even if empty.
                _logger.LogWarning("No body captured for {Url} (status {Status}).",
                    responseUrl, e.ResponseStatusCode);
                return;
            }

            // Interlocked because the handler may be raised concurrently.
            var n = Interlocked.Increment(ref sequence);
            try
            {
                var fileName = $"{n:D3}_{Sanitize(responseUrl)}.json";
                File.WriteAllText(Path.Combine(outputDir, fileName), body, Encoding.UTF8);

                // Count only after the file is on disk so the return value
                // matches the documented "responses written".
                Interlocked.Increment(ref written);

                _logger.LogInformation(
                    "Captured #{N} [{Status}] {Url} → {File} ({Bytes} bytes).",
                    n, e.ResponseStatusCode, responseUrl, fileName, body.Length);
            }
            catch (Exception ex)
            {
                // Best effort: one bad capture must not take down the session.
                _logger.LogWarning(ex, "Failed to write capture for {Url}.", responseUrl);
            }
        }

        try
        {
            // Inside the outer try so the browser is still shut down if the
            // network hookup itself throws.
            // NOTE(review): this method never calls network.StartMonitoring();
            // presumably the factory does — confirm, otherwise no events fire.
            var network = driver.Manage().Network;
            network.NetworkResponseReceived += OnResponse;

            try
            {
                _logger.LogInformation("Navigating to {Url}", url);
                driver.Navigate().GoToUrl(url);
                await browseUntilDone();
            }
            finally
            {
                network.NetworkResponseReceived -= OnResponse;
            }
        }
        finally
        {
            driver.Quit();
        }

        return Volatile.Read(ref written);
    }

    // True when the URL targets an API endpoint: either a "/api/" path segment
    // (csfloat.com/api/...) or a host that starts with "api." (api.csfloat.com).
    private static bool IsApiUrl(string responseUrl)
    {
        if (responseUrl.Contains("/api/", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        return Uri.TryCreate(responseUrl, UriKind.Absolute, out var parsed)
            && parsed.Host.StartsWith("api.", StringComparison.OrdinalIgnoreCase);
    }

    // Turn a URL into a filesystem-safe, readable, length-capped file stem so the
    // captures are self-describing (the endpoint is visible in the filename).
    private static string Sanitize(string url)
    {
        var trimmed = url
            .Replace("https://", "", StringComparison.OrdinalIgnoreCase)
            .Replace("http://", "", StringComparison.OrdinalIgnoreCase);

        // Keep letters, digits, '-' and '.'; everything else becomes '_'.
        var sb = new StringBuilder(trimmed.Length);
        foreach (var c in trimmed)
        {
            sb.Append(char.IsLetterOrDigit(c) || c is '-' or '.' ? c : '_');
        }

        var stem = sb.ToString();
        return stem.Length > MaxStemLength ? stem[..MaxStemLength] : stem;
    }
}
|