almost ready
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
using System.Collections.Concurrent;
|
||||
using BlueLaminate.Core.CsMoney;
|
||||
using BlueLaminate.EFCore.Data;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
|
||||
@@ -7,42 +6,58 @@ namespace BlueLaminate.C2;
|
||||
|
||||
/// <summary>
|
||||
/// Hands out scrape jobs to workers, one skin+wear at a time, driven directly by the
|
||||
/// catalogue's per-band checkpoints (<c>SkinCondition.ListingsSweptAt</c>) rather than
|
||||
/// a pre-built queue. Each claim picks the stalest band (never-swept first), leases it
|
||||
/// in memory so two workers can't get the same one, and builds a free-text search. On
|
||||
/// completion the ingest stamps <c>ListingsSweptAt</c>, so the band drops to the back —
|
||||
/// the sweep loops the whole catalogue continuously and resumes cleanly after restarts.
|
||||
/// catalogue's per-band, per-site checkpoints (the rows in <c>skin_condition_sweeps</c>
|
||||
/// for this queue's <see cref="_source"/>) rather than a pre-built queue. Each claim picks
|
||||
/// the stalest band (never-swept first), leases it in memory so two workers can't get the
|
||||
/// same one, and builds the work target. On completion the ingest stamps the band's
|
||||
/// checkpoint, so it drops to the back — the sweep loops the whole catalogue continuously
|
||||
/// and resumes cleanly after restarts. Because the checkpoint is per-site, a band that
|
||||
/// one market just swept can still be due on another.
|
||||
/// <para>
|
||||
/// The queue is source-agnostic: it's constructed with the checkpoint
|
||||
/// <see cref="_source"/> and a <see cref="_targetBuilder"/> that turns a band into the
|
||||
/// thing a worker needs — a free-text search for cs.money, a market URL for skin.land — so
|
||||
/// one class drives every market. Register one instance per source.
|
||||
/// </para>
|
||||
/// <para>
|
||||
/// A <see cref="_minResweepInterval"/> floor keeps a band from being re-handed-out until
|
||||
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue
|
||||
/// as fast as the workers run, which on a metered residential proxy is the dominant cost;
|
||||
/// the floor trades a little price-freshness for a roughly linear bandwidth cut (a 6h
|
||||
/// floor vs. continuous ≈ 6× less, if a full pass takes ~1h). When every band is fresher
|
||||
/// than the floor the queue hands out nothing (workers idle) until one ages past it.
|
||||
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue as
|
||||
/// fast as the workers run, which on a metered residential proxy is the dominant cost; the
|
||||
/// floor trades a little price-freshness for a roughly linear bandwidth cut. When every
|
||||
/// band is fresher than the floor the queue hands out nothing (workers idle) until one ages past it.
|
||||
/// </para>
|
||||
/// </summary>
|
||||
public sealed class JobQueue
|
||||
{
|
||||
// A leased condition can't be re-handed-out until released or the lease expires
|
||||
// (so a crashed worker's band returns to the pool instead of stalling forever).
|
||||
// A leased condition can't be re-handed-out until released or the lease expires (so a
|
||||
// crashed worker's band returns to the pool instead of stalling forever).
|
||||
private static readonly TimeSpan LeaseTtl = TimeSpan.FromMinutes(15);
|
||||
private const int CandidateBatch = 100;
|
||||
|
||||
private readonly string _source;
|
||||
private readonly TimeSpan _minResweepInterval;
|
||||
private readonly Func<Candidate, string> _targetBuilder;
|
||||
private readonly SemaphoreSlim _gate = new(1, 1);
|
||||
private readonly ConcurrentDictionary<int, DateTimeOffset> _leases = new(); // conditionId -> leasedAt
|
||||
private readonly ConcurrentDictionary<string, JobMapping> _inFlight = new(); // jobId -> mapping
|
||||
|
||||
/// <param name="source">
|
||||
/// The <c>skin_condition_sweeps.Source</c> this queue reads/leases on (a
|
||||
/// <c>SweepSource</c> value, e.g. "csmoney" / "skinland").
|
||||
/// </param>
|
||||
/// <param name="minResweepInterval">
|
||||
/// How stale a band's <c>ListingsSweptAt</c> must be before it's eligible again.
|
||||
/// How stale a band's checkpoint must be before it's eligible again.
|
||||
/// <see cref="TimeSpan.Zero"/> disables the floor (continuous re-sweep).
|
||||
/// </param>
|
||||
public JobQueue(TimeSpan minResweepInterval)
|
||||
/// <param name="targetBuilder">Turns a claimed band into the worker's target string.</param>
|
||||
public JobQueue(string source, TimeSpan minResweepInterval, Func<Candidate, string> targetBuilder)
|
||||
{
|
||||
_source = source;
|
||||
_minResweepInterval = minResweepInterval;
|
||||
_targetBuilder = targetBuilder;
|
||||
}
|
||||
|
||||
public async Task<ScrapeJobDto?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
|
||||
public async Task<ClaimedJob?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
|
||||
{
|
||||
await _gate.WaitAsync(ct);
|
||||
try
|
||||
@@ -58,17 +73,26 @@ public sealed class JobQueue
|
||||
}
|
||||
|
||||
// Only consider bands that are never-swept or stale past the re-sweep floor,
|
||||
// then stalest first (never-swept null sorts before any timestamp). With the
|
||||
// floor in place a fully-fresh catalogue yields no candidates, so workers idle
|
||||
// instead of needlessly re-pulling ~1MB pages on the metered proxy.
|
||||
// then stalest first (never-swept null sorts before any timestamp). The
|
||||
// checkpoint is read for THIS queue's source only (a correlated subquery over
|
||||
// the per-site sweep rows), so another market sweeping a band does not make
|
||||
// it fresh here. With the floor in place a fully-fresh catalogue yields no
|
||||
// candidates, so workers idle instead of needlessly re-pulling on the proxy.
|
||||
var freshCutoff = DateTimeOffset.UtcNow - _minResweepInterval;
|
||||
var candidates = await db.SkinConditions
|
||||
.Where(c => c.ListingsSweptAt == null || c.ListingsSweptAt <= freshCutoff)
|
||||
.OrderBy(c => c.ListingsSweptAt.HasValue)
|
||||
.ThenBy(c => c.ListingsSweptAt)
|
||||
.Select(c => new Candidate(
|
||||
c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition))
|
||||
.Select(c => new
|
||||
{
|
||||
Candidate = new Candidate(c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition),
|
||||
SweptAt = c.Sweeps
|
||||
.Where(s => s.Source == _source)
|
||||
.Select(s => (DateTimeOffset?)s.SweptAt)
|
||||
.FirstOrDefault(),
|
||||
})
|
||||
.Where(x => x.SweptAt == null || x.SweptAt <= freshCutoff)
|
||||
.OrderBy(x => x.SweptAt.HasValue)
|
||||
.ThenBy(x => x.SweptAt)
|
||||
.Take(CandidateBatch)
|
||||
.Select(x => x.Candidate)
|
||||
.ToListAsync(ct);
|
||||
|
||||
var pick = candidates.FirstOrDefault(c => !_leases.ContainsKey(c.ConditionId));
|
||||
@@ -81,9 +105,7 @@ public sealed class JobQueue
|
||||
var jobId = Guid.NewGuid().ToString("N");
|
||||
_inFlight[jobId] = new JobMapping(pick.SkinId, pick.ConditionId);
|
||||
|
||||
var code = Wear.ToCode(pick.Condition) ?? pick.Condition;
|
||||
var search = $"{pick.Weapon} {pick.SkinName} {code}".Trim();
|
||||
return new ScrapeJobDto(jobId, pick.SkinId, pick.ConditionId, search, maxPages);
|
||||
return new ClaimedJob(jobId, pick.SkinId, pick.ConditionId, _targetBuilder(pick), maxPages);
|
||||
}
|
||||
finally
|
||||
{
|
||||
@@ -107,5 +129,8 @@ public sealed class JobQueue
|
||||
|
||||
public sealed record JobMapping(int SkinId, int ConditionId);
|
||||
|
||||
private sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
|
||||
/// <summary>A claimed band ready to hand to a worker: its ids + built target string.</summary>
|
||||
public sealed record ClaimedJob(string JobId, int SkinId, int ConditionId, string Target, int MaxPages);
|
||||
|
||||
public sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user