JobQueue now skips bands swept within MinResweepHours (config, default 6h) instead of re-scraping the whole catalogue continuously — the dominant cost on the metered residential proxy. Roughly linear savings with no data loss (full pagination retained); 0 disables it. Worker logs the real compressed transferSize per job (what the proxy bills) rather than the ~6.5x-larger decompressed length, so spend is visible. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
112 lines
4.8 KiB
C#
112 lines
4.8 KiB
C#
using System.Collections.Concurrent;
|
||
using BlueLaminate.Core.CsMoney;
|
||
using BlueLaminate.EFCore.Data;
|
||
using Microsoft.EntityFrameworkCore;
|
||
|
||
namespace BlueLaminate.C2;
|
||
|
||
/// <summary>
/// Hands out scrape jobs to workers, one skin+wear band at a time, driven directly by the
/// catalogue's per-band checkpoints (<c>SkinCondition.ListingsSweptAt</c>) rather than
/// a pre-built queue. Each claim picks the stalest eligible band (never-swept first),
/// leases it in memory so two workers can't get the same one, and builds a free-text
/// search. On completion the ingest stamps <c>ListingsSweptAt</c>, so the band drops to
/// the back of the ordering and the sweep resumes cleanly after restarts.
/// <para>
/// A <see cref="_minResweepInterval"/> floor keeps a band from being re-handed-out until
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue
/// as fast as the workers run, which on a metered residential proxy is the dominant cost;
/// the floor trades a little price-freshness for a roughly linear bandwidth cut (a 6h
/// floor vs. continuous ≈ 6× less, if a full pass takes ~1h). When every band is fresher
/// than the floor the queue hands out nothing (workers idle) until one ages past it.
/// </para>
/// <para>
/// Every lease remembers the job that owns it. A job whose lease already expired and was
/// re-issued to another worker therefore cannot release the newer worker's lease when it
/// finally reports back.
/// </para>
/// </summary>
public sealed class JobQueue
{
    // A leased condition can't be re-handed-out until released or the lease expires
    // (so a crashed worker's band returns to the pool instead of stalling forever).
    private static readonly TimeSpan LeaseTtl = TimeSpan.FromMinutes(15);

    // How many of the stalest eligible bands are fetched per claim attempt.
    private const int CandidateBatch = 100;

    private readonly TimeSpan _minResweepInterval;

    // Serialises ClaimNextAsync so pick-then-lease is atomic across concurrent workers.
    private readonly SemaphoreSlim _gate = new(1, 1);

    private readonly ConcurrentDictionary<int, Lease> _leases = new();          // conditionId -> current lease
    private readonly ConcurrentDictionary<string, JobMapping> _inFlight = new(); // jobId -> mapping

    /// <summary>Creates a queue with the given re-sweep floor.</summary>
    /// <param name="minResweepInterval">
    /// How stale a band's <c>ListingsSweptAt</c> must be before it's eligible again.
    /// <see cref="TimeSpan.Zero"/> disables the floor (continuous re-sweep).
    /// </param>
    public JobQueue(TimeSpan minResweepInterval)
    {
        _minResweepInterval = minResweepInterval;
    }

    /// <summary>
    /// Claims the stalest eligible, currently unleased band and wraps it in a new job.
    /// </summary>
    /// <param name="db">Context used to query <c>SkinConditions</c> for candidates.</param>
    /// <param name="maxPages">Page limit passed through unchanged into the returned job.</param>
    /// <param name="ct">Cancels both the wait for the internal gate and the database query.</param>
    /// <returns>
    /// The claimed job, or <c>null</c> when no band in the stalest
    /// <see cref="CandidateBatch"/> eligible bands is free (either nothing is stale
    /// enough, or every candidate in that batch is already leased).
    /// </returns>
    /// <exception cref="OperationCanceledException"><paramref name="ct"/> was cancelled.</exception>
    public async Task<ScrapeJobDto?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
    {
        await _gate.WaitAsync(ct);
        try
        {
            // Reclaim expired leases first so abandoned bands are visible to this claim.
            ReclaimExpiredLeases(DateTimeOffset.UtcNow - LeaseTtl);

            // Only consider bands that are never-swept or stale past the re-sweep floor,
            // then stalest first: ordering by HasValue puts never-swept (null) rows ahead
            // of any timestamp. With the floor in place a fully-fresh catalogue yields no
            // candidates, so workers idle instead of needlessly re-pulling ~1MB pages on
            // the metered proxy.
            var freshCutoff = DateTimeOffset.UtcNow - _minResweepInterval;
            var candidates = await db.SkinConditions
                .Where(c => c.ListingsSweptAt == null || c.ListingsSweptAt <= freshCutoff)
                .OrderBy(c => c.ListingsSweptAt.HasValue)
                .ThenBy(c => c.ListingsSweptAt)
                .Select(c => new Candidate(
                    c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition))
                .Take(CandidateBatch)
                .ToListAsync(ct);

            var pick = candidates.FirstOrDefault(c => !_leases.ContainsKey(c.ConditionId));
            if (pick is null)
            {
                return null; // nothing eligible, or everything in the stalest batch is already in flight
            }

            // The lease carries the job id so only this job (or expiry) can release it.
            var jobId = Guid.NewGuid().ToString("N");
            _leases[pick.ConditionId] = new Lease(jobId, DateTimeOffset.UtcNow);
            _inFlight[jobId] = new JobMapping(pick.SkinId, pick.ConditionId);

            // Fall back to the raw condition text when Wear.ToCode yields null.
            var code = Wear.ToCode(pick.Condition) ?? pick.Condition;
            var search = $"{pick.Weapon} {pick.SkinName} {code}".Trim();
            return new ScrapeJobDto(jobId, pick.SkinId, pick.ConditionId, search, maxPages);
        }
        finally
        {
            _gate.Release();
        }
    }

    /// <summary>Resolve a posted job to its skin+condition and release its lease.</summary>
    /// <param name="jobId">The id previously returned inside the claimed job.</param>
    /// <returns>
    /// The job's mapping, or <c>null</c> when the id is unknown or was already completed.
    /// </returns>
    /// <remarks>
    /// The lease is released only when it still belongs to <paramref name="jobId"/>. If
    /// the lease expired and the band was re-leased to a different job, the newer lease
    /// is left intact; the mapping is still returned so the late result can be resolved.
    /// </remarks>
    public JobMapping? Complete(string jobId)
    {
        if (!_inFlight.TryRemove(jobId, out var mapping))
        {
            return null;
        }

        if (_leases.TryGetValue(mapping.ConditionId, out var lease) && lease.JobId == jobId)
        {
            // Compare-and-remove: a no-op if the lease was swapped since the read above.
            _leases.TryRemove(new KeyValuePair<int, Lease>(mapping.ConditionId, lease));
        }

        return mapping;
    }

    /// <summary>Number of claimed jobs that have not been completed yet.</summary>
    public int InFlight => _inFlight.Count;

    // Drops every lease taken before the cutoff. Removal is compare-and-remove on the
    // exact entry observed, so a lease that was replaced in the meantime is not touched.
    // The expired job's _inFlight entry is left in place so a late result can still be
    // resolved by Complete.
    // NOTE(review): entries for workers that never report back are therefore never
    // pruned and keep counting towards InFlight — confirm whether that is acceptable.
    private void ReclaimExpiredLeases(DateTimeOffset cutoff)
    {
        foreach (var entry in _leases)
        {
            if (entry.Value.LeasedAt < cutoff)
            {
                _leases.TryRemove(entry);
            }
        }
    }

    /// <summary>The skin and condition a job id resolves to.</summary>
    public sealed record JobMapping(int SkinId, int ConditionId);

    // An in-memory claim on a condition: which job holds it and since when.
    private readonly record struct Lease(string JobId, DateTimeOffset LeasedAt);

    // Projection of the columns needed to build a job for one band.
    private sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
}
|