almost ready

This commit is contained in:
bob
2026-06-01 10:52:06 -05:00
parent 8b0eb0db78
commit 763305ca89
94 changed files with 8766 additions and 2674 deletions

View File

@@ -1,4 +1,5 @@
using BlueLaminate.Core.CsMoney;
using BlueLaminate.Core.SkinLand;
namespace BlueLaminate.C2;
@@ -17,3 +18,20 @@ public sealed record ScrapeJobDto(string JobId, int SkinId, int? ConditionId, st
/// <param name="StoppedReason">Why it stopped. "completed" = full sweep (authoritative);
/// anything else (fetch-cap / challenged / stuck-float-tie) is partial.</param>
public sealed record ScrapeResultDto(List<CsMoneyItem> Items, int Pages, string? StoppedReason);
/// <summary>
/// One unit of skin.land scrape work: a single skin+wear pair, addressed by its market
/// page URL.
/// </summary>
/// <param name="JobId">Opaque identifier; the worker sends it back when posting results.</param>
/// <param name="SkinId">The catalogue skin this job is for.</param>
/// <param name="ConditionId">The wear band (a skin_conditions row).</param>
/// <param name="Url">
/// The skin.land market page, e.g.
/// "https://skin.land/market/csgo/ak-47-redline-field-tested/". The worker resolves the
/// internal skin_id from this page and then pages the obtained-skins API.
/// </param>
/// <param name="MaxPages">Safety cap on offer-page fetches (Laravel paginator, ~26/page).</param>
public sealed record SkinLandJobDto(
    string JobId,
    int SkinId,
    int ConditionId,
    string Url,
    int MaxPages);
/// <summary>
/// What a worker posts back for a claimed skin.land job: the offers it scraped.
/// </summary>
/// <param name="Items">
/// Every obtained-skins offer collected across the fetched pages, in skin.land's raw shape.
/// </param>
/// <param name="Pages">Number of offer pages the worker fetched.</param>
/// <param name="StoppedReason">
/// Why the worker stopped. "completed" means a full sweep (authoritative); any other value
/// (fetch-cap / challenged / no-skin-id) means the result is partial.
/// </param>
public sealed record SkinLandResultDto(
    List<SkinLandOffer> Items,
    int Pages,
    string? StoppedReason);

View File

@@ -1,5 +1,4 @@
using System.Collections.Concurrent;
using BlueLaminate.Core.CsMoney;
using BlueLaminate.EFCore.Data;
using Microsoft.EntityFrameworkCore;
@@ -7,42 +6,58 @@ namespace BlueLaminate.C2;
/// <summary>
/// Hands out scrape jobs to workers, one skin+wear at a time, driven directly by the
/// catalogue's per-band checkpoints (<c>SkinCondition.ListingsSweptAt</c>) rather than
/// a pre-built queue. Each claim picks the stalest band (never-swept first), leases it
/// in memory so two workers can't get the same one, and builds a free-text search. On
/// completion the ingest stamps <c>ListingsSweptAt</c>, so the band drops to the back —
/// the sweep loops the whole catalogue continuously and resumes cleanly after restarts.
/// catalogue's per-band, per-site checkpoints (the rows in <c>skin_condition_sweeps</c>
/// for this queue's <see cref="_source"/>) rather than a pre-built queue. Each claim picks
/// the stalest band (never-swept first), leases it in memory so two workers can't get the
/// same one, and builds the work target. On completion the ingest stamps the band's
/// checkpoint, so it drops to the back — the sweep loops the whole catalogue continuously
/// and resumes cleanly after restarts. Because the checkpoint is per-site, a band one
/// market just swept is still due on another.
/// <para>
/// The queue is source-agnostic: it's constructed with the checkpoint
/// <see cref="_source"/> and a <see cref="_targetBuilder"/> that turns a band into the
/// thing a worker needs — a free-text search for cs.money, a market URL for skin.land — so
/// one class drives every market. Register one instance per source.
/// </para>
/// <para>
/// A <see cref="_minResweepInterval"/> floor keeps a band from being re-handed-out until
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue
/// as fast as the workers run, which on a metered residential proxy is the dominant cost;
/// the floor trades a little price-freshness for a roughly linear bandwidth cut (a 6h
/// floor vs. continuous ≈ 6× less, if a full pass takes ~1h). When every band is fresher
/// than the floor the queue hands out nothing (workers idle) until one ages past it.
/// its data is at least that stale. Without it the queue re-scrapes the whole catalogue as
/// fast as the workers run, which on a metered residential proxy is the dominant cost; the
/// floor trades a little price-freshness for a roughly linear bandwidth cut. When every
/// band is fresher than the floor the queue hands out nothing (workers idle) until one ages.
/// </para>
/// </summary>
public sealed class JobQueue
{
// A leased condition can't be re-handed-out until released or the lease expires
// (so a crashed worker's band returns to the pool instead of stalling forever).
// A leased condition can't be re-handed-out until released or the lease expires (so a
// crashed worker's band returns to the pool instead of stalling forever).
private static readonly TimeSpan LeaseTtl = TimeSpan.FromMinutes(15);
private const int CandidateBatch = 100;
private readonly string _source;
private readonly TimeSpan _minResweepInterval;
private readonly Func<Candidate, string> _targetBuilder;
private readonly SemaphoreSlim _gate = new(1, 1);
private readonly ConcurrentDictionary<int, DateTimeOffset> _leases = new(); // conditionId -> leasedAt
private readonly ConcurrentDictionary<string, JobMapping> _inFlight = new(); // jobId -> mapping
/// <summary>Creates a queue bound to a single market source.</summary>
/// <param name="source">
/// The <c>skin_condition_sweeps.Source</c> this queue reads/leases on (a
/// <c>SweepSource</c> value, e.g. "csmoney" / "skinland").
/// </param>
/// <param name="minResweepInterval">
/// How stale a band's checkpoint must be before it's eligible again.
/// <see cref="TimeSpan.Zero"/> disables the floor (continuous re-sweep).
/// </param>
/// <param name="targetBuilder">Turns a claimed band into the worker's target string.</param>
/// <exception cref="ArgumentException">
/// <paramref name="source"/> is null, empty or whitespace.
/// </exception>
/// <exception cref="ArgumentNullException"><paramref name="targetBuilder"/> is null.</exception>
public JobQueue(string source, TimeSpan minResweepInterval, Func<Candidate, string> targetBuilder)
{
    // Fail at registration time: a missing source or builder would otherwise only show up
    // later, inside a claim, as a NullReferenceException or a query that matches nothing.
    ArgumentException.ThrowIfNullOrWhiteSpace(source);
    ArgumentNullException.ThrowIfNull(targetBuilder);

    _source = source;
    _minResweepInterval = minResweepInterval;
    _targetBuilder = targetBuilder;
}
public async Task<ScrapeJobDto?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
public async Task<ClaimedJob?> ClaimNextAsync(SkinTrackerDbContext db, int maxPages, CancellationToken ct)
{
await _gate.WaitAsync(ct);
try
@@ -58,17 +73,26 @@ public sealed class JobQueue
}
// Only consider bands that are never-swept or stale past the re-sweep floor,
// then stalest first (never-swept null sorts before any timestamp). With the
// floor in place a fully-fresh catalogue yields no candidates, so workers idle
// instead of needlessly re-pulling ~1MB pages on the metered proxy.
// then stalest first (never-swept null sorts before any timestamp). The
// checkpoint is read for THIS queue's source only (a correlated subquery over
// the per-site sweep rows), so a band another market just swept is still
// never-swept here. With the floor in place a fully-fresh catalogue yields no
// candidates, so workers idle instead of needlessly re-pulling on the proxy.
var freshCutoff = DateTimeOffset.UtcNow - _minResweepInterval;
var candidates = await db.SkinConditions
.Where(c => c.ListingsSweptAt == null || c.ListingsSweptAt <= freshCutoff)
.OrderBy(c => c.ListingsSweptAt.HasValue)
.ThenBy(c => c.ListingsSweptAt)
.Select(c => new Candidate(
c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition))
.Select(c => new
{
Candidate = new Candidate(c.Id, c.SkinId, c.Skin.Weapon.Name, c.Skin.Name, c.Condition),
SweptAt = c.Sweeps
.Where(s => s.Source == _source)
.Select(s => (DateTimeOffset?)s.SweptAt)
.FirstOrDefault(),
})
.Where(x => x.SweptAt == null || x.SweptAt <= freshCutoff)
.OrderBy(x => x.SweptAt.HasValue)
.ThenBy(x => x.SweptAt)
.Take(CandidateBatch)
.Select(x => x.Candidate)
.ToListAsync(ct);
var pick = candidates.FirstOrDefault(c => !_leases.ContainsKey(c.ConditionId));
@@ -81,9 +105,7 @@ public sealed class JobQueue
var jobId = Guid.NewGuid().ToString("N");
_inFlight[jobId] = new JobMapping(pick.SkinId, pick.ConditionId);
var code = Wear.ToCode(pick.Condition) ?? pick.Condition;
var search = $"{pick.Weapon} {pick.SkinName} {code}".Trim();
return new ScrapeJobDto(jobId, pick.SkinId, pick.ConditionId, search, maxPages);
return new ClaimedJob(jobId, pick.SkinId, pick.ConditionId, _targetBuilder(pick), maxPages);
}
finally
{
@@ -107,5 +129,8 @@ public sealed class JobQueue
public sealed record JobMapping(int SkinId, int ConditionId);
private sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
/// <summary>A claimed band ready to hand to a worker: its ids + built target string.</summary>
public sealed record ClaimedJob(string JobId, int SkinId, int ConditionId, string Target, int MaxPages);
public sealed record Candidate(int ConditionId, int SkinId, string Weapon, string SkinName, string Condition);
}

View File

@@ -1,13 +1,16 @@
using BlueLaminate.C2;
using BlueLaminate.Core.CsMoney;
using BlueLaminate.Core.DependencyInjection;
using BlueLaminate.Core.SkinLand;
using System.Text.Json.Serialization;
using BlueLaminate.EFCore.Data;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.DependencyInjection;
// The C2: hands cs.money scrape jobs to Python workers and ingests their results.
// Reuses the whole BlueLaminate stack (DB, ingest service) via the one composition root.
// Content root = the binary directory so appsettings.json is found regardless of the
// working directory the process is launched from (matches the CLI's approach).
// The C2: hands cs.money and skin.land scrape jobs to Python workers and ingests their
// results. Reuses the whole BlueLaminate stack (DB, ingest services) via the one
// composition root. Content root = the binary directory so appsettings.json is found
// regardless of the working directory the process is launched from (matches the CLI).
var builder = WebApplication.CreateBuilder(new WebApplicationOptions
{
Args = args,
@@ -15,17 +18,34 @@ var builder = WebApplication.CreateBuilder(new WebApplicationOptions
});
builder.Services.AddBlueLaminateCore(builder.Configuration);
// Re-sweep floor: don't re-hand-out a band whose listings were swept less than this
// many hours ago. The dominant cost on the metered residential proxy is re-scraping
// already-fresh bands, so this caps how often any band is re-pulled. 0 = continuous.
// Some numbers in worker result bodies arrive as JSON strings (skin.land's item_float
// comes through as "0.60…"). Let the serializer read string-encoded numbers so those
// fields bind, parsed straight to decimal (full precision preserved). Harmless to
// cs.money's numeric fields.
builder.Services.ConfigureHttpJsonOptions(options =>
{
    options.SerializerOptions.NumberHandling =
        options.SerializerOptions.NumberHandling | JsonNumberHandling.AllowReadingFromString;
});
// Re-sweep floor: don't re-hand-out a band whose listings were swept less than this many
// hours ago. The dominant cost on the metered residential proxy is re-scraping already-
// fresh bands, so this caps how often any band is re-pulled. 0 = continuous. Shared by
// both markets (each keeps its own per-site checkpoints, so the floors are independent).
var minResweepHours = builder.Configuration.GetValue("MinResweepHours", 6.0);
builder.Services.AddSingleton(new JobQueue(TimeSpan.FromHours(minResweepHours)));
var floor = TimeSpan.FromHours(minResweepHours);
// One JobQueue per market source (same class, different checkpoint source + target). The
// candidate query reads each band's checkpoint for that queue's source only, so the two
// sweeps progress independently over the shared catalogue.
builder.Services.AddKeyedSingleton(CsMoneyIngestService.Source, new JobQueue(
CsMoneyIngestService.Source, floor,
c => $"{c.Weapon} {c.SkinName} {Wear.ToCode(c.Condition) ?? c.Condition}".Trim()));
builder.Services.AddKeyedSingleton(SkinLandIngestService.Source, new JobQueue(
SkinLandIngestService.Source, floor,
c => SkinLandSlug.MarketUrl(c.Weapon, c.SkinName, c.Condition)));
var app = builder.Build();
// Apply pending EF migrations at startup (incl. the market_listings view) so a fresh
// container is ready with one command. Disable with AutoMigrate=false if you'd rather
// run `dotnet ef database update` yourself.
// container is ready with one command. Disable with AutoMigrate=false if you'd rather run
// `dotnet ef database update` yourself.
if (app.Configuration.GetValue("AutoMigrate", true))
{
using var scope = app.Services.CreateScope();
@@ -33,8 +53,8 @@ if (app.Configuration.GetValue("AutoMigrate", true))
db.Database.Migrate();
}
// Shared-secret gate. Workers send it as X-Worker-Token; if no token is configured
// the gate is open (local dev). Set WorkerToken (config) / WORKER_TOKEN (env) in prod.
// Shared-secret gate. Workers send it as X-Worker-Token; if no token is configured the
// gate is open (local dev). Set WorkerToken (config) / WORKER_TOKEN (env) in prod.
var workerToken = builder.Configuration["WorkerToken"];
var maxPagesPerJob = builder.Configuration.GetValue("MaxPagesPerJob", 60);
@@ -49,30 +69,43 @@ app.MapGet("/market/instance/{instanceId:int}", async (
int instanceId, MarketPresenceService presence, CancellationToken ct) =>
Results.Ok(await presence.ForInstanceAsync(instanceId, ct)));
// The same X-Worker-Token gate applied to every worker-facing route group. Takes a route
// group, attaches the shared-secret filter to it and returns the same group so the call
// can wrap MapGroup inline. When no WorkerToken is configured the filter lets every
// request through (local dev).
Func<RouteGroupBuilder, RouteGroupBuilder> withTokenGate = group =>
{
    // Encode the configured secret once per group rather than on every request.
    var expectedToken = string.IsNullOrEmpty(workerToken)
        ? null
        : System.Text.Encoding.UTF8.GetBytes(workerToken);

    group.AddEndpointFilter(async (ctx, next) =>
    {
        if (expectedToken is not null)
        {
            var presentedToken = System.Text.Encoding.UTF8.GetBytes(
                ctx.HttpContext.Request.Headers["X-Worker-Token"].ToString());

            // Fixed-time comparison: an ordinary string != stops at the first differing
            // character, so response timing can reveal how much of a guessed token is
            // right. A missing header encodes to an empty array and is rejected.
            if (!System.Security.Cryptography.CryptographicOperations.FixedTimeEquals(
                    presentedToken, expectedToken))
            {
                return Results.Unauthorized();
            }
        }

        return await next(ctx);
    });

    return group;
};
// --- cs.money worker endpoints (unchanged behaviour) ------------------------------------
var jobs = withTokenGate(app.MapGroup("/jobs"));
// Claim the next stalest skin+wear to scrape. 204 when nothing is currently available
// (everything in the stalest batch is already leased to other workers).
jobs.MapGet("/next", async (JobQueue queue, SkinTrackerDbContext db, CancellationToken ct) =>
jobs.MapGet("/next", async (
[FromKeyedServices(CsMoneyIngestService.Source)] JobQueue queue,
SkinTrackerDbContext db, CancellationToken ct) =>
{
var job = await queue.ClaimNextAsync(db, maxPagesPerJob, ct);
return job is null ? Results.NoContent() : Results.Ok(job);
return job is null
? Results.NoContent()
: Results.Ok(new ScrapeJobDto(job.JobId, job.SkinId, job.ConditionId, job.Target, job.MaxPages));
});
// Post a claimed job's scraped listings. The C2 owns parsing/persistence so the
// worker stays dumb: it just forwards the raw cs.money items it gathered.
// Post a claimed job's scraped listings. The C2 owns parsing/persistence so the worker
// stays dumb: it just forwards the raw cs.money items it gathered.
jobs.MapPost("/{jobId}/result", async (
string jobId, ScrapeResultDto result, JobQueue queue, CsMoneyIngestService ingest, CancellationToken ct) =>
string jobId, ScrapeResultDto result,
[FromKeyedServices(CsMoneyIngestService.Source)] JobQueue queue,
CsMoneyIngestService ingest, CancellationToken ct) =>
{
var mapping = queue.Complete(jobId);
if (mapping is null)
@@ -89,4 +122,33 @@ jobs.MapPost("/{jobId}/result", async (
return Results.Ok(r);
});
// --- skin.land worker endpoints ---------------------------------------------------------
var skinLandJobs = withTokenGate(app.MapGroup("/skinland/jobs"));
// Claim the next skin+wear from the skin.land queue. Responds 204 No Content when the
// queue hands nothing out, otherwise 200 with the job as a SkinLandJobDto (the queue's
// built target string travels as the DTO's Url).
skinLandJobs.MapGet("/next", async (
    [FromKeyedServices(SkinLandIngestService.Source)] JobQueue queue,
    SkinTrackerDbContext db, CancellationToken ct) =>
{
    var claimed = await queue.ClaimNextAsync(db, maxPagesPerJob, ct);
    if (claimed is null)
    {
        return Results.NoContent();
    }

    return Results.Ok(new SkinLandJobDto(
        claimed.JobId, claimed.SkinId, claimed.ConditionId, claimed.Target, claimed.MaxPages));
});
// Post a claimed skin.land job's scraped offers. The jobId is resolved back to its
// skin+wear through the queue; if the queue has no mapping for it the request gets a 404.
// Otherwise the offers are handed to the skin.land ingest service and its result is
// returned with 200.
skinLandJobs.MapPost("/{jobId}/result", async (
    string jobId, SkinLandResultDto result,
    [FromKeyedServices(SkinLandIngestService.Source)] JobQueue queue,
    SkinLandIngestService ingest, CancellationToken ct) =>
{
    if (queue.Complete(jobId) is not { } claim)
    {
        return Results.NotFound(new { error = "unknown or expired jobId" });
    }

    // Only a StoppedReason of "completed" (any casing) counts as a full sweep; every
    // other value, including a missing one, is passed to the ingest as partial.
    var isFullSweep = string.Equals(result.StoppedReason, "completed", StringComparison.OrdinalIgnoreCase);

    // A body without an Items array is ingested as an empty offer list.
    var outcome = await ingest.IngestAsync(
        claim.SkinId, claim.ConditionId, result.Items ?? [], isFullSweep, ct);

    return Results.Ok(outcome);
});
app.Run();