Cut metered-proxy bandwidth: re-sweep floor + wire-size logging
JobQueue now skips bands swept within MinResweepHours (config, default 6h) instead of re-scraping the whole catalogue continuously — the dominant cost on the metered residential proxy. Bandwidth savings are roughly linear in the floor, with no data loss (full pagination retained); setting MinResweepHours to 0 disables the floor. Worker now logs the real compressed transferSize per job (what the proxy bills) rather than the ~6.5x-larger decompressed length, so spend is visible.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,14 @@ namespace BlueLaminate.C2;
|
||||
/// in memory so two workers can't get the same one, and builds a free-text search. On
|
||||
/// completion the ingest stamps <c>ListingsSweptAt</c>, so the band drops to the back —
|
||||
/// the sweep loops the whole catalogue continuously and resumes cleanly after restarts.
|
||||
/// <para>
|
||||
/// A <see cref="_minResweepInterval"/> floor keeps a band from being re-handed-out until
|
||||
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue
|
||||
/// as fast as the workers run, which on a metered residential proxy is the dominant cost;
|
||||
/// the floor trades a little price-freshness for a roughly linear bandwidth cut (e.g. if a
|
||||
/// full pass takes ~1h, a 6h floor means ≈ 6× fewer re-sweeps than continuous). When every band is fresher
|
||||
/// than the floor the queue hands out nothing (workers idle) until one ages past it.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public sealed class JobQueue
|
||||
{
|
||||
@@ -20,10 +28,20 @@ public sealed class JobQueue
|
||||
private static readonly TimeSpan LeaseTtl = TimeSpan.FromMinutes(15);
|
||||
private const int CandidateBatch = 100;
|
||||
|
||||
private readonly TimeSpan _minResweepInterval;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
private readonly ConcurrentDictionary<int, DateTimeOffset> _leases = new(); // conditionId -> leasedAt
|
||||
private readonly ConcurrentDictionary<string, JobMapping> _inFlight = new(); // jobId -> mapping
|
||||
|
||||
/// <summary>
/// Creates the queue and records the re-sweep floor in <see cref="_minResweepInterval"/>.
/// </summary>
/// <param name="minResweepInterval">
|
||||
/// How stale a band's <c>ListingsSweptAt</c> must be before it's eligible again.
|
||||
/// <see cref="TimeSpan.Zero"/> disables the floor (continuous re-sweep).
|
||||
/// </param>
|
||||
public JobQueue(TimeSpan minResweepInterval)
|
||||
{
|
||||
// Stored exactly as given; no range validation is performed here.
_minResweepInterval = minResweepInterval;
|
||||
}
|
||||
|
||||
public async Task<ScrapeJobDto?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
|
||||
{
|
||||
await _gate.WaitAsync(ct);
|
||||
@@ -39,8 +57,13 @@ public sealed class JobQueue
|
||||
}
|
||||
}
|
||||
|
||||
// Stalest bands first (never-swept null sorts before any timestamp).
|
||||
// Only consider bands that are never-swept or stale past the re-sweep floor,
|
||||
// then stalest first (never-swept null sorts before any timestamp). With the
|
||||
// floor in place a fully-fresh catalogue yields no candidates, so workers idle
|
||||
// instead of needlessly re-pulling ~1MB pages on the metered proxy.
|
||||
var freshCutoff = DateTimeOffset.UtcNow - _minResweepInterval;
|
||||
var candidates = await db.SkinConditions
|
||||
.Where(c => c.ListingsSweptAt == null || c.ListingsSweptAt <= freshCutoff)
|
||||
.OrderBy(c => c.ListingsSweptAt.HasValue)
|
||||
.ThenBy(c => c.ListingsSweptAt)
|
||||
.Select(c => new Candidate(
|
||||
|
||||
@@ -14,7 +14,12 @@ var builder = WebApplication.CreateBuilder(new WebApplicationOptions
|
||||
ContentRootPath = AppContext.BaseDirectory,
|
||||
});
|
||||
builder.Services.AddBlueLaminateCore(builder.Configuration);
|
||||
builder.Services.AddSingleton<JobQueue>();
|
||||
|
||||
// Re-sweep floor: don't re-hand-out a band whose listings were swept less than this
|
||||
// many hours ago. The dominant cost on the metered residential proxy is re-scraping
|
||||
// already-fresh bands, so this caps how often any band is re-pulled. 0 = continuous.
|
||||
var minResweepHours = builder.Configuration.GetValue("MinResweepHours", 6.0);
|
||||
builder.Services.AddSingleton(new JobQueue(TimeSpan.FromHours(minResweepHours)));
|
||||
|
||||
var app = builder.Build();
|
||||
|
||||
|
||||
@@ -12,5 +12,6 @@
|
||||
"SkinTracker": "Host=localhost;Port=5432;Database=skintracker;Username=postgres"
|
||||
},
|
||||
"WorkerToken": "dev-worker-token",
|
||||
"MaxPagesPerJob": 60
|
||||
"MaxPagesPerJob": 60,
|
||||
"MinResweepHours": 6
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user