Brings up the pull-model scraper: the .NET C2 hands skin+wear jobs to Python nodriver workers that scrape cs.money and post results back, plus the supporting Core/EFCore data model, migrations, and docker-compose orchestration. IPRoyal proxying lets workers scale horizontally with a distinct residential exit IP each: every worker process mints its own sticky session at startup, and an in-process forwarding proxy injects the gateway auth so Chromium talks only to an auth-free localhost endpoint (zero CDP). On a Cloudflare challenge a worker rotates to a fresh session/IP and re-warms. Verified end-to-end against live IPRoyal: distinct US residential exits per worker and IP rotation on demand. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
89 lines
3.5 KiB
C#
89 lines
3.5 KiB
C#
using System.Collections.Concurrent;
using System.Collections.Generic;
using BlueLaminate.Core.CsMoney;
using BlueLaminate.EFCore.Data;
using Microsoft.EntityFrameworkCore;
|
|
|
|
namespace BlueLaminate.C2;
|
|
|
|
/// <summary>
/// Hands out scrape jobs to workers, one skin+wear at a time, driven directly by the
/// catalogue's per-band checkpoints (<c>SkinCondition.ListingsSweptAt</c>) rather than
/// a pre-built queue. Each claim picks the stalest band (never-swept first), leases it
/// in memory so two workers can't get the same one, and builds a free-text search. On
/// completion the ingest stamps <c>ListingsSweptAt</c>, so the band drops to the back —
/// the sweep loops the whole catalogue continuously and resumes cleanly after restarts.
/// </summary>
/// <remarks>
/// Every lease is owned by the job that took it. When a lease outlives
/// <see cref="LeaseTtl"/> the job is treated as abandoned: its lease and its in-flight
/// mapping are both dropped, so a result posted afterwards no longer resolves
/// (<see cref="Complete"/> returns <c>null</c>) and cannot release a lease that has
/// since been handed to another job.
/// </remarks>
public sealed class JobQueue
{
    // A leased condition can't be re-handed-out until released or the lease expires
    // (so a crashed worker's band returns to the pool instead of stalling forever).
    private static readonly TimeSpan LeaseTtl = TimeSpan.FromMinutes(15);

    // How many of the stalest bands are fetched per claim; the first unleased one wins.
    private const int CandidateBatch = 100;

    // Serialises claims so pick-then-lease is atomic with respect to other claims.
    private readonly SemaphoreSlim _gate = new(1, 1);
    private readonly ConcurrentDictionary<int, Lease> _leases = new(); // conditionId -> owning job + leasedAt
    private readonly ConcurrentDictionary<string, JobMapping> _inFlight = new(); // jobId -> mapping

    /// <summary>
    /// Claims the stalest band that is not currently leased and returns it as a job.
    /// </summary>
    /// <param name="db">Context whose <c>SkinConditions</c> set is queried for candidates.</param>
    /// <param name="maxPages">Passed through unchanged to the returned job.</param>
    /// <param name="ct">Cancels both the wait for the claim gate and the database query.</param>
    /// <returns>
    /// The new job, or <c>null</c> when every band in the stalest batch is already leased
    /// (or the catalogue is empty).
    /// </returns>
    public async Task<ScrapeJobDto?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
    {
        await _gate.WaitAsync(ct);
        try
        {
            // Reclaim expired leases first.
            ReclaimExpiredLeases(DateTimeOffset.UtcNow - LeaseTtl);

            // Stalest bands first (never-swept null sorts before any timestamp).
            var candidates = await db.SkinConditions
                .OrderBy(c => c.ListingsSweptAt.HasValue)
                .ThenBy(c => c.ListingsSweptAt)
                .Select(c => new Candidate(
                    c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition))
                .Take(CandidateBatch)
                .ToListAsync(ct);

            var pick = candidates.FirstOrDefault(c => !_leases.ContainsKey(c.ConditionId));
            if (pick is null)
            {
                return null; // everything in the stalest batch is already in flight
            }

            // The lease remembers which job owns it, so only that job (or expiry) can
            // release it.
            var jobId = Guid.NewGuid().ToString("N");
            _inFlight[jobId] = new JobMapping(pick.SkinId, pick.ConditionId);
            _leases[pick.ConditionId] = new Lease(jobId, DateTimeOffset.UtcNow);

            // Fall back to the raw condition text when Wear.ToCode yields null.
            var code = Wear.ToCode(pick.Condition) ?? pick.Condition;
            var search = $"{pick.Weapon} {pick.SkinName} {code}".Trim();
            return new ScrapeJobDto(jobId, pick.SkinId, pick.ConditionId, search, maxPages);
        }
        finally
        {
            _gate.Release();
        }
    }

    /// <summary>Resolve a posted job to its skin+condition and release its lease.</summary>
    /// <param name="jobId">Id previously returned by <see cref="ClaimNextAsync"/>.</param>
    /// <returns>
    /// The job's mapping, or <c>null</c> when the id is null/empty, unknown, already
    /// completed, or belongs to a job whose lease expired.
    /// </returns>
    public JobMapping? Complete(string jobId)
    {
        if (string.IsNullOrEmpty(jobId))
        {
            return null;
        }

        if (!_inFlight.TryRemove(jobId, out var mapping))
        {
            return null;
        }

        // Release the lease only if this job still owns it. Claims run concurrently with
        // completions, so the band may already have been reclaimed and re-leased.
        if (_leases.TryGetValue(mapping.ConditionId, out var lease) && lease.JobId == jobId)
        {
            _leases.TryRemove(new KeyValuePair<int, Lease>(mapping.ConditionId, lease));
        }

        return mapping;
    }

    /// <summary>Number of jobs handed out and not yet completed or expired.</summary>
    public int InFlight => _inFlight.Count;

    /// <summary>The skin and condition a job id resolves to.</summary>
    public sealed record JobMapping(int SkinId, int ConditionId);

    // Drops every lease taken before the cutoff together with its job's in-flight mapping.
    private void ReclaimExpiredLeases(DateTimeOffset cutoff)
    {
        foreach (var (conditionId, lease) in _leases)
        {
            if (lease.LeasedAt >= cutoff)
            {
                continue;
            }

            // Value-checked removal: only succeeds if the entry is still this exact lease.
            if (_leases.TryRemove(new KeyValuePair<int, Lease>(conditionId, lease)))
            {
                _inFlight.TryRemove(lease.JobId, out _);
            }
        }
    }

    // Who holds a condition and since when.
    private sealed record Lease(string JobId, DateTimeOffset LeasedAt);

    private sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
}
|