Add init weapon scraper

This commit is contained in:
bob
2026-05-29 14:00:58 -05:00
parent 286d1366fe
commit 6f3c0175cd
20 changed files with 1199 additions and 62 deletions

View File

@@ -0,0 +1,13 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Compiler settings: target framework, implicit global usings, and nullable reference type analysis. -->
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- HtmlAgilityPack provides the HTML parser (HtmlDocument, XPath node selection, entity decoding) used by the wiki scrapers. -->
<ItemGroup>
<PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,7 @@
namespace BlueLaminate.Scraper.Weapons;
/// <summary>A single CS2 weapon parsed from the Counter-Strike wiki.</summary>
/// <remarks>
/// Declared as a positional record: the three values are set at construction
/// and two instances with equal values compare equal.
/// </remarks>
/// <param name="Name">
/// Display name, e.g. "AK-47". The scraper stores it without the trailing
/// "(CT)" / "(T)" team annotation and without a leading "Stock " prefix.
/// </param>
/// <param name="Type">Category from the wiki heading, e.g. "Pistols", "Assault Rifles".</param>
/// <param name="Team">
/// "CT", "T", or "Both". "Both" is used when the weapon was listed without a
/// team annotation or was listed for both teams.
/// </param>
public sealed record ScrapedWeapon(string Name, string Type, string Team);

View File

@@ -0,0 +1,172 @@
using System.Text.RegularExpressions;
using BlueLaminate.Scraper.Wiki;
using HtmlAgilityPack;
namespace BlueLaminate.Scraper.Weapons;
/// <summary>
/// Scrapes the CS2 weapon catalogue from the wiki's "Weapons" page.
///
/// Layout: the page has one "tabber" per weapon category, each immediately
/// preceded by a section heading (the category / Type). Inside each tabber the
/// "Global Offensive &amp; Counter-Strike 2" tab holds a gallery of captions —
/// one per weapon, optionally suffixed with "(CT)" or "(T)" for team-locked
/// weapons.
/// </summary>
public sealed class WeaponWikiScraper
{
// Title of the wiki page handed to the fetcher.
private const string Page = "Weapons";
// Value of the tab's data-hash attribute (after entity decoding) that
// identifies the "Global Offensive & Counter-Strike 2" tab within a tabber.
private const string Cs2TabHash = "Global_Offensive_&_Counter-Strike_2";
// Matches a trailing "(CT)" / "(T)" team annotation, capturing the team.
// Whitespace on either side of the parentheses is part of the match, so
// removing the match also removes the space that separates it from the name.
private static readonly Regex TeamAnnotation =
new(@"\s*\((CT|T)\)\s*$", RegexOptions.Compiled);
// The wiki labels the default knife "Stock Knife"; drop the prefix.
// The pattern is not knife-specific: any name starting with "Stock" followed
// by whitespace loses that prefix.
private static readonly Regex StockPrefix =
new(@"^Stock\s+", RegexOptions.Compiled);
// Page loader used by ScrapeAsync to obtain the parsed wiki document.
private readonly WikiPageFetcher _fetcher;
/// <summary>Creates a scraper that loads wiki pages through <paramref name="fetcher"/>.</summary>
/// <param name="fetcher">Fetcher used to load the "Weapons" page.</param>
public WeaponWikiScraper(WikiPageFetcher fetcher) => _fetcher = fetcher;
/// <summary>
/// Loads the "Weapons" wiki page and extracts one <see cref="ScrapedWeapon"/>
/// per distinct weapon name found in the CS2 tab of each category tabber.
/// </summary>
/// <param name="ct">Token forwarded to the page fetch.</param>
/// <returns>
/// The weapons in the order their names were first encountered; empty when the
/// page contains no headings or tabbers.
/// </returns>
public async Task<IReadOnlyList<ScrapedWeapon>> ScrapeAsync(CancellationToken ct = default)
{
    const string TabberXPath =
        "//div[contains(concat(' ', normalize-space(@class), ' '), ' tabber ')]";

    var document = await _fetcher.LoadAsync(Page, ct);
    var collected = new WeaponAggregator();

    // One combined query so headings and tabbers come back interleaved; a
    // tabber's category is whichever heading was seen most recently.
    var candidates = document.DocumentNode.SelectNodes("//h2 | //h3 | //h4 | " + TabberXPath);
    if (candidates is null)
    {
        return collected.Build();
    }

    string? category = null;
    foreach (var candidate in candidates)
    {
        switch (candidate.Name)
        {
            case "h2":
            case "h3":
            case "h4":
                category = HeadingText(candidate);
                break;

            default:
                // A tabber that appears before any heading has no category
                // to attach to, so it is ignored.
                if (category is not null)
                {
                    foreach (var caption in Cs2Captions(candidate))
                    {
                        collected.Add(caption, category);
                    }
                }
                break;
        }
    }

    return collected.Build();
}
/// <summary>
/// Yields the normalized gallery caption texts found under the CS2 tab of one
/// tabber. Yields nothing when the tabber has no tab list, no CS2 tab, no
/// content panel at the CS2 tab's position, or no captions in that panel.
/// </summary>
/// <param name="tabber">The tabber container element to search within.</param>
private static IEnumerable<string> Cs2Captions(HtmlNode tabber)
{
    var tabHeaders = tabber.SelectNodes(
        ".//li[contains(concat(' ', normalize-space(@class), ' '), ' wds-tabs__tab ')]");
    if (tabHeaders is null)
    {
        yield break;
    }

    // Walk the tab headers until the CS2 one is found; ending the walk at
    // tabHeaders.Count means it is absent.
    var cs2Position = 0;
    while (cs2Position < tabHeaders.Count)
    {
        // HtmlAgilityPack returns attribute values un-decoded, and the wiki
        // entity-encodes the "&" in this hash (&amp;).
        var rawHash = tabHeaders[cs2Position].GetAttributeValue("data-hash", string.Empty);
        if (HtmlEntity.DeEntitize(rawHash) == Cs2TabHash)
        {
            break;
        }

        cs2Position++;
    }

    if (cs2Position == tabHeaders.Count)
    {
        yield break;
    }

    // Content panels are matched to tab headers purely by position.
    var panels = tabber.SelectNodes(
        ".//div[contains(concat(' ', normalize-space(@class), ' '), ' wds-tab__content ')]");
    if (panels is null || panels.Count <= cs2Position)
    {
        yield break;
    }

    var captionNodes = panels[cs2Position].SelectNodes(
        ".//div[contains(concat(' ', normalize-space(@class), ' '), ' lightbox-caption ')]");
    if (captionNodes is null)
    {
        yield break;
    }

    foreach (var captionNode in captionNodes)
    {
        yield return WikiText.Normalize(captionNode.InnerText);
    }
}
/// <summary>
/// Returns the normalized text of a section heading, taken from its
/// "mw-headline" span when one exists and from the whole heading otherwise.
/// </summary>
/// <param name="heading">The heading element to read.</param>
private static string HeadingText(HtmlNode heading)
{
    var textSource = heading.SelectSingleNode(
        ".//span[contains(concat(' ', normalize-space(@class), ' '), ' mw-headline ')]");
    if (textSource is null)
    {
        // No dedicated headline span: fall back to the heading element itself.
        textSource = heading;
    }

    return WikiText.Normalize(textSource.InnerText);
}
/// <summary>
/// Merges per-caption rows into a single weapon per name. For every name it
/// remembers the category it was first seen under and which team annotations
/// accompanied it, so a weapon listed as both "(CT)" and "(T)" (or listed
/// without an annotation) ends up as "Both".
/// </summary>
private sealed class WeaponAggregator
{
    /// <summary>What has been observed so far for one weapon name.</summary>
    private sealed class Sighting
    {
        public Sighting(string type)
        {
            Type = type;
        }

        /// <summary>Category recorded when the name was first added.</summary>
        public string Type { get; }

        public bool Ct;
        public bool T;
        public bool Plain;
    }

    private readonly Dictionary<string, Sighting> _seen = new();

    // Names in first-seen order, so Build() emits weapons in that order.
    private readonly List<string> _firstSeenOrder = new();

    /// <summary>Records one gallery caption under the given category.</summary>
    /// <param name="caption">Caption text, optionally ending in "(CT)" or "(T)".</param>
    /// <param name="type">Category to store if this is the first time the name is seen.</param>
    public void Add(string caption, string type)
    {
        if (string.IsNullOrEmpty(caption))
        {
            return;
        }

        // The annotation pattern is anchored to the end of the caption, so
        // everything before the match is the weapon name.
        var annotation = TeamAnnotation.Match(caption);
        var withoutTeam = annotation.Success ? caption[..annotation.Index] : caption;

        var name = StockPrefix.Replace(withoutTeam, string.Empty).Trim();
        if (name.Length == 0)
        {
            return;
        }

        if (!_seen.TryGetValue(name, out var sighting))
        {
            sighting = new Sighting(type);
            _seen.Add(name, sighting);
            _firstSeenOrder.Add(name);
        }

        if (annotation.Success)
        {
            if (annotation.Groups[1].Value == "CT")
            {
                sighting.Ct = true;
            }
            else
            {
                sighting.T = true;
            }
        }
        else
        {
            sighting.Plain = true;
        }
    }

    /// <summary>Produces the aggregated weapons in first-seen order.</summary>
    public IReadOnlyList<ScrapedWeapon> Build()
    {
        var weapons = new List<ScrapedWeapon>(_firstSeenOrder.Count);
        foreach (var name in _firstSeenOrder)
        {
            var sighting = _seen[name];
            weapons.Add(new ScrapedWeapon(name, sighting.Type, ResolveTeam(sighting)));
        }

        return weapons;
    }

    /// <summary>Maps the recorded annotations to "CT", "T", or "Both".</summary>
    private static string ResolveTeam(Sighting sighting) =>
        (sighting.Plain, sighting.Ct, sighting.T) switch
        {
            (true, _, _) => "Both",
            (_, true, true) => "Both",
            (_, true, false) => "CT",
            (_, false, true) => "T",
            _ => "Both",
        };
}
}

View File

@@ -0,0 +1,51 @@
using System.Text.Json;
using HtmlAgilityPack;
namespace BlueLaminate.Scraper.Wiki;
/// <summary>
/// Fetches a rendered page from the Counter-Strike Fandom wiki, shared by all
/// wiki scrapers.
///
/// The rendered HTML pages sit behind Cloudflare, which 403s .NET's TLS
/// fingerprint regardless of headers. The MediaWiki <c>action=parse</c> API is
/// not challenged, so we fetch the same content as JSON from there and return
/// the embedded HTML as a parsed document.
/// </summary>
public sealed class WikiPageFetcher
{
    private const string ApiBase = "https://counterstrike.fandom.com/api.php";

    private readonly HttpClient _http;

    /// <summary>Creates a fetcher that issues its requests through <paramref name="http"/>.</summary>
    /// <param name="http">HTTP client to use; its lifetime is owned by the caller.</param>
    /// <exception cref="ArgumentNullException"><paramref name="http"/> is null.</exception>
    public WikiPageFetcher(HttpClient http) =>
        _http = http ?? throw new ArgumentNullException(nameof(http));

    /// <summary>Loads a wiki page (e.g. "Weapons") as a parsed HTML document.</summary>
    /// <param name="page">Wiki page title; it is URL-escaped before being sent.</param>
    /// <param name="ct">Token observed by the request, the body read and the JSON parse.</param>
    /// <returns>The HTML found at <c>parse.text["*"]</c> in the API response, parsed into a document.</returns>
    /// <exception cref="HttpRequestException">The API answered with a non-success status code.</exception>
    /// <exception cref="InvalidOperationException">
    /// The API reported an error, or the response did not have the expected
    /// <c>parse.text["*"]</c> string.
    /// </exception>
    public async Task<HtmlDocument> LoadAsync(string page, CancellationToken ct = default)
    {
        var url = $"{ApiBase}?action=parse&page={Uri.EscapeDataString(page)}&prop=text&format=json";

        using var resp = await _http.GetAsync(url, ct);
        resp.EnsureSuccessStatusCode();

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
        using var json = await JsonDocument.ParseAsync(stream, cancellationToken: ct);

        var root = json.RootElement;

        // TryGetProperty throws on anything but a JSON object, so check the
        // kind first and report a shape problem in our own words.
        if (root.ValueKind != JsonValueKind.Object)
        {
            throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");
        }

        if (root.TryGetProperty("error", out var error))
        {
            var info = DescribeError(error);
            throw new InvalidOperationException($"Wiki API returned an error for page '{page}': {info}");
        }

        // Walk parse -> text -> "*" defensively: a missing or mistyped level
        // yields the same descriptive failure rather than a KeyNotFoundException.
        if (!TryGetChildObject(root, "parse", out var parse)
            || !TryGetChildObject(parse, "text", out var text)
            || !text.TryGetProperty("*", out var body)
            || body.ValueKind != JsonValueKind.String)
        {
            throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");
        }

        var html = body.GetString()
            ?? throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    /// <summary>Reads <c>error.info</c> when it is a string; otherwise returns "unknown error".</summary>
    private static string DescribeError(JsonElement error)
    {
        if (error.ValueKind == JsonValueKind.Object
            && error.TryGetProperty("info", out var info)
            && info.ValueKind == JsonValueKind.String)
        {
            return info.GetString() ?? "unknown error";
        }

        return "unknown error";
    }

    /// <summary>Gets a named property only when it exists and is itself a JSON object.</summary>
    private static bool TryGetChildObject(JsonElement parent, string name, out JsonElement child) =>
        parent.TryGetProperty(name, out child) && child.ValueKind == JsonValueKind.Object;
}

View File

@@ -0,0 +1,14 @@
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace BlueLaminate.Scraper.Wiki;
/// <summary>String clean-up helpers used by the wiki scrapers.</summary>
public static class WikiText
{
    private static readonly Regex Whitespace = new(@"\s+", RegexOptions.Compiled);

    /// <summary>
    /// Decodes HTML entities in <paramref name="raw"/>, replaces every run of
    /// whitespace with a single space, and trims both ends.
    /// </summary>
    /// <param name="raw">Text as read from an HTML node.</param>
    /// <returns>The cleaned text; empty when entity decoding yields null.</returns>
    public static string Normalize(string raw)
    {
        var decoded = HtmlEntity.DeEntitize(raw);
        if (decoded is null)
        {
            decoded = string.Empty;
        }

        var singleSpaced = Whitespace.Replace(decoded, " ");
        return singleSpaced.Trim();
    }
}