Add init weapon scraper

2026-05-29 14:00:58 -05:00
parent 286d1366fe
commit 6f3c0175cd
20 changed files with 1199 additions and 62 deletions
--- a/BlueLaminate/BlueLaminate.Scraper/BlueLaminate.Scraper.csproj
+++ b/BlueLaminate/BlueLaminate.Scraper/BlueLaminate.Scraper.csproj
@@ -0,0 +1,13 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <!-- Compiler settings for the scraper project. -->
+  <PropertyGroup>
+    <!-- Target framework moniker the project is built against. -->
+    <TargetFramework>net10.0</TargetFramework>
+    <!-- Implicit global usings: the sources rely on this for names such as Task,
+         CancellationToken and HttpClient, which they use without explicit using directives. -->
+    <ImplicitUsings>enable</ImplicitUsings>
+    <!-- Nullable reference type annotations and warnings are on for every file. -->
+    <Nullable>enable</Nullable>
+  </PropertyGroup>
+
+  <!-- NuGet dependencies. -->
+  <ItemGroup>
+    <!-- HTML parser used by the wiki fetcher and scrapers (HtmlDocument, HtmlNode, HtmlEntity). -->
+    <PackageReference Include="HtmlAgilityPack" Version="1.12.4" />
+  </ItemGroup>
+
+</Project>
--- a/BlueLaminate/BlueLaminate.Scraper/Weapons/ScrapedWeapon.cs
+++ b/BlueLaminate/BlueLaminate.Scraper/Weapons/ScrapedWeapon.cs
@@ -0,0 +1,7 @@
+namespace BlueLaminate.Scraper.Weapons;
+
+/// <summary>A single CS2 weapon parsed from the Counter-Strike wiki.</summary>
+/// <remarks>
+/// Immutable positional record with value equality. The record itself performs no
+/// validation; the value descriptions below reflect what
+/// <see cref="WeaponWikiScraper"/> produces.
+/// </remarks>
+/// <param name="Name">
+/// Display name, e.g. "AK-47". The scraper strips a trailing "(CT)" / "(T)" team
+/// annotation and a leading "Stock " prefix before storing the name.
+/// </param>
+/// <param name="Type">
+/// Category from the wiki heading, e.g. "Pistols", "Assault Rifles". When the same
+/// name appears under several headings, the scraper keeps the first one it saw.
+/// </param>
+/// <param name="Team">
+/// "CT", "T", or "Both". The scraper emits "Both" when the weapon was captioned
+/// without a team annotation, or with both annotations.
+/// </param>
+public sealed record ScrapedWeapon(string Name, string Type, string Team);
--- a/BlueLaminate/BlueLaminate.Scraper/Weapons/WeaponWikiScraper.cs
+++ b/BlueLaminate/BlueLaminate.Scraper/Weapons/WeaponWikiScraper.cs
@@ -0,0 +1,172 @@
+using System.Text.RegularExpressions;
+using BlueLaminate.Scraper.Wiki;
+using HtmlAgilityPack;
+
+namespace BlueLaminate.Scraper.Weapons;
+
+/// <summary>
+/// Scrapes the CS2 weapon catalogue from the wiki's "Weapons" page.
+///
+/// Layout: the page has one "tabber" per weapon category, each immediately
+/// preceded by a section heading (the category / Type). Inside each tabber the
+/// "Global Offensive &amp; Counter-Strike 2" tab holds a gallery of captions —
+/// one per weapon, optionally suffixed with "(CT)" or "(T)" for team-locked
+/// weapons.
+/// </summary>
+/// <remarks>
+/// Tab headers, tab panels and captions are only attributed to a tabber when
+/// that tabber is their nearest tabber ancestor. A tabber nested inside a panel
+/// therefore cannot shift the header/panel index of the outer tabber; it is
+/// processed on its own when the document walk reaches it.
+/// </remarks>
+public sealed class WeaponWikiScraper
+{
+    private const string Page = "Weapons";
+    private const string Cs2TabHash = "Global_Offensive_&_Counter-Strike_2";
+    private const string TabberClass = "tabber";
+
+    // Matches a trailing "(CT)" / "(T)" team annotation, capturing the team.
+    private static readonly Regex TeamAnnotation =
+        new(@"\s*\((CT|T)\)\s*$", RegexOptions.Compiled);
+
+    // The wiki labels the default knife "Stock Knife"; drop the prefix.
+    private static readonly Regex StockPrefix =
+        new(@"^Stock\s+", RegexOptions.Compiled);
+
+    private readonly WikiPageFetcher _fetcher;
+
+    /// <summary>Creates a scraper that loads the page through <paramref name="fetcher"/>.</summary>
+    /// <exception cref="ArgumentNullException"><paramref name="fetcher"/> is null.</exception>
+    public WeaponWikiScraper(WikiPageFetcher fetcher) =>
+        _fetcher = fetcher ?? throw new ArgumentNullException(nameof(fetcher));
+
+    /// <summary>
+    /// Loads the "Weapons" page and returns one entry per distinct weapon name,
+    /// in the order the names first appear in the document.
+    /// </summary>
+    /// <param name="ct">Token forwarded to the page fetch.</param>
+    /// <returns>
+    /// The scraped weapons; empty when the page contains no headings or tabbers.
+    /// </returns>
+    public async Task<IReadOnlyList<ScrapedWeapon>> ScrapeAsync(CancellationToken ct = default)
+    {
+        var doc = await _fetcher.LoadAsync(Page, ct);
+
+        // Headings and tabbers in document order so each tabber inherits the
+        // most recent heading as its category.
+        var nodes = doc.DocumentNode.SelectNodes(
+            $"//h2 | //h3 | //h4 | //div[{ClassPredicate(TabberClass)}]");
+
+        var aggregator = new WeaponAggregator();
+        if (nodes is null)
+            return aggregator.Build();
+
+        string? currentType = null;
+        foreach (var node in nodes)
+        {
+            if (node.Name is "h2" or "h3" or "h4")
+            {
+                currentType = HeadingText(node);
+                continue;
+            }
+
+            // A tabber that appears before any heading has no category; skip it.
+            if (currentType is null)
+                continue;
+
+            foreach (var caption in Cs2Captions(node))
+                aggregator.Add(caption, currentType);
+        }
+
+        return aggregator.Build();
+    }
+
+    /// <summary>Caption texts from the CS2 tab of a single tabber, if present.</summary>
+    private static IEnumerable<string> Cs2Captions(HtmlNode tabber)
+    {
+        var tabs = OwnedDescendants(tabber, tabber, "li", "wds-tabs__tab");
+
+        // HtmlAgilityPack returns attribute values un-decoded, and the wiki
+        // entity-encodes the "&" in this hash (&amp;).
+        var index = tabs.FindIndex(tab =>
+            HtmlEntity.DeEntitize(tab.GetAttributeValue("data-hash", string.Empty)) == Cs2TabHash);
+        if (index < 0)
+            yield break;
+
+        // Panels are matched to headers by position, so both lists must hold
+        // only this tabber's own nodes.
+        var contents = OwnedDescendants(tabber, tabber, "div", "wds-tab__content");
+        if (index >= contents.Count)
+            yield break;
+
+        foreach (var caption in OwnedDescendants(contents[index], tabber, "div", "lightbox-caption"))
+            yield return WikiText.Normalize(caption.InnerText);
+    }
+
+    /// <summary>
+    /// Descendants of <paramref name="scope"/> with the given element name and
+    /// CSS class whose nearest tabber ancestor is <paramref name="tabber"/>.
+    /// </summary>
+    private static List<HtmlNode> OwnedDescendants(
+        HtmlNode scope, HtmlNode tabber, string element, string cssClass)
+    {
+        var owned = new List<HtmlNode>();
+        var matches = scope.SelectNodes($".//{element}[{ClassPredicate(cssClass)}]");
+        if (matches is null)
+            return owned;
+
+        foreach (var match in matches)
+        {
+            if (IsOwnedBy(match, tabber))
+                owned.Add(match);
+        }
+
+        return owned;
+    }
+
+    /// <summary>True when the closest tabber ancestor of <paramref name="node"/> is <paramref name="tabber"/>.</summary>
+    private static bool IsOwnedBy(HtmlNode node, HtmlNode tabber)
+    {
+        for (var ancestor = node.ParentNode; ancestor is not null; ancestor = ancestor.ParentNode)
+        {
+            if (IsTabber(ancestor))
+                return ReferenceEquals(ancestor, tabber);
+        }
+
+        return false;
+    }
+
+    /// <summary>True for a div whose class list contains the tabber token.</summary>
+    private static bool IsTabber(HtmlNode node)
+    {
+        if (node.Name != "div")
+            return false;
+
+        var tokens = node.GetAttributeValue("class", string.Empty)
+            .Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries);
+        return Array.IndexOf(tokens, TabberClass) >= 0;
+    }
+
+    /// <summary>
+    /// XPath predicate body matching elements whose class attribute contains
+    /// <paramref name="cssClass"/> as a whole whitespace-separated token.
+    /// </summary>
+    private static string ClassPredicate(string cssClass) =>
+        $"contains(concat(' ', normalize-space(@class), ' '), ' {cssClass} ')";
+
+    /// <summary>
+    /// Normalized heading text: the inner "mw-headline" span when the heading
+    /// has one, otherwise the whole heading.
+    /// </summary>
+    private static string HeadingText(HtmlNode heading)
+    {
+        var headline = heading.SelectSingleNode($".//span[{ClassPredicate("mw-headline")}]");
+        return WikiText.Normalize((headline ?? heading).InnerText);
+    }
+
+    /// <summary>
+    /// Collapses the per-caption rows into one weapon per name, tracking which
+    /// teams it appeared for so a weapon shown as both "(CT)" and "(T)" (or with
+    /// no annotation) resolves to "Both".
+    /// </summary>
+    private sealed class WeaponAggregator
+    {
+        private sealed class Entry
+        {
+            public required string Type { get; init; }
+            public bool SawCt { get; set; }
+            public bool SawT { get; set; }
+            public bool SawUnannotated { get; set; }
+
+            // Team-locked only when exactly one annotation was seen and the
+            // weapon never appeared unannotated; everything else is "Both".
+            public string Team => (SawCt, SawT, SawUnannotated) switch
+            {
+                (true, false, false) => "CT",
+                (false, true, false) => "T",
+                _ => "Both",
+            };
+        }
+
+        private readonly Dictionary<string, Entry> _byName = new();
+
+        // First-seen order of names, so Build() is stable in document order.
+        private readonly List<string> _order = new();
+
+        /// <summary>
+        /// Records one gallery caption under the given category. Empty captions,
+        /// and captions that reduce to an empty name, are ignored. The category
+        /// of the first occurrence of a name is the one that is kept.
+        /// </summary>
+        public void Add(string caption, string type)
+        {
+            if (string.IsNullOrEmpty(caption))
+                return;
+
+            // The pattern is anchored at the end of the input, so a successful
+            // match always runs to the end and the name is everything before it.
+            var match = TeamAnnotation.Match(caption);
+            var withoutTeam = match.Success ? caption[..match.Index] : caption;
+            var name = StockPrefix.Replace(withoutTeam, string.Empty).Trim();
+            if (name.Length == 0)
+                return;
+
+            if (!_byName.TryGetValue(name, out var entry))
+            {
+                entry = new Entry { Type = type };
+                _byName[name] = entry;
+                _order.Add(name);
+            }
+
+            if (!match.Success)
+                entry.SawUnannotated = true;
+            else if (match.Groups[1].Value == "CT")
+                entry.SawCt = true;
+            else
+                entry.SawT = true;
+        }
+
+        /// <summary>Materializes the collected weapons in first-seen order.</summary>
+        public IReadOnlyList<ScrapedWeapon> Build()
+        {
+            var result = new List<ScrapedWeapon>(_order.Count);
+            foreach (var name in _order)
+            {
+                var entry = _byName[name];
+                result.Add(new ScrapedWeapon(name, entry.Type, entry.Team));
+            }
+
+            return result;
+        }
+    }
+}
--- a/BlueLaminate/BlueLaminate.Scraper/Wiki/WikiPageFetcher.cs
+++ b/BlueLaminate/BlueLaminate.Scraper/Wiki/WikiPageFetcher.cs
@@ -0,0 +1,51 @@
+using System.Text.Json;
+using HtmlAgilityPack;
+
+namespace BlueLaminate.Scraper.Wiki;
+
+/// <summary>
+/// Fetches a rendered page from the Counter-Strike Fandom wiki, shared by all
+/// wiki scrapers.
+///
+/// The rendered HTML pages sit behind Cloudflare, which 403s .NET's TLS
+/// fingerprint regardless of headers. The MediaWiki <c>action=parse</c> API is
+/// not challenged, so we fetch the same content as JSON from there and return
+/// the embedded HTML as a parsed document.
+/// </summary>
+public sealed class WikiPageFetcher
+{
+    private const string ApiBase = "https://counterstrike.fandom.com/api.php";
+
+    private readonly HttpClient _http;
+
+    /// <summary>Creates a fetcher that issues its requests through <paramref name="http"/>.</summary>
+    /// <exception cref="ArgumentNullException"><paramref name="http"/> is null.</exception>
+    public WikiPageFetcher(HttpClient http) =>
+        _http = http ?? throw new ArgumentNullException(nameof(http));
+
+    /// <summary>Loads a wiki page (e.g. "Weapons") as a parsed HTML document.</summary>
+    /// <param name="page">Wiki page title; it is URL-escaped before being sent.</param>
+    /// <param name="ct">Token passed to the HTTP request, the body read and the JSON parse.</param>
+    /// <returns>The HTML carried in the API response, parsed into a document.</returns>
+    /// <exception cref="ArgumentNullException"><paramref name="page"/> is null.</exception>
+    /// <exception cref="HttpRequestException">The API answered with a non-success status code.</exception>
+    /// <exception cref="InvalidOperationException">
+    /// The API reported an error, or the response did not contain the parsed text.
+    /// </exception>
+    public async Task<HtmlDocument> LoadAsync(string page, CancellationToken ct = default)
+    {
+        ArgumentNullException.ThrowIfNull(page);
+
+        var url = $"{ApiBase}?action=parse&page={Uri.EscapeDataString(page)}&prop=text&format=json";
+
+        using var resp = await _http.GetAsync(url, ct);
+        resp.EnsureSuccessStatusCode();
+
+        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
+        using var json = await JsonDocument.ParseAsync(stream, cancellationToken: ct);
+
+        // Extract the string before the JsonDocument is disposed at method exit.
+        var html = ExtractHtml(json.RootElement, page);
+
+        var doc = new HtmlDocument();
+        doc.LoadHtml(html);
+        return doc;
+    }
+
+    /// <summary>
+    /// Pulls the rendered HTML out of an <c>action=parse</c> response, i.e. the
+    /// string at <c>parse.text["*"]</c>. Every unexpected shape is reported as an
+    /// <see cref="InvalidOperationException"/> that names the page, instead of
+    /// leaking a lookup failure from the JSON reader.
+    /// </summary>
+    private static string ExtractHtml(JsonElement root, string page)
+    {
+        if (TryGetMember(root, "error", out var error))
+        {
+            var info = TryGetMember(error, "info", out var i) && i.ValueKind == JsonValueKind.String
+                ? i.GetString()
+                : "unknown error";
+            throw new InvalidOperationException($"Wiki API returned an error for page '{page}': {info}");
+        }
+
+        if (TryGetMember(root, "parse", out var parse)
+            && TryGetMember(parse, "text", out var text)
+            && TryGetMember(text, "*", out var body)
+            && body.ValueKind == JsonValueKind.String
+            && body.GetString() is { } html)
+        {
+            return html;
+        }
+
+        throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");
+    }
+
+    /// <summary>
+    /// Property lookup that also tolerates non-object elements;
+    /// <see cref="JsonElement.TryGetProperty(string, out JsonElement)"/> throws on those.
+    /// </summary>
+    private static bool TryGetMember(JsonElement element, string name, out JsonElement value)
+    {
+        value = default;
+        return element.ValueKind == JsonValueKind.Object && element.TryGetProperty(name, out value);
+    }
+}
--- a/BlueLaminate/BlueLaminate.Scraper/Wiki/WikiText.cs
+++ b/BlueLaminate/BlueLaminate.Scraper/Wiki/WikiText.cs
@@ -0,0 +1,14 @@
+using System.Text.RegularExpressions;
+using HtmlAgilityPack;
+
+namespace BlueLaminate.Scraper.Wiki;
+
+/// <summary>Text helpers shared by wiki scrapers.</summary>
+public static class WikiText
+{
+    // One or more consecutive whitespace characters.
+    private static readonly Regex WhitespaceRuns = new(@"\s+", RegexOptions.Compiled);
+
+    /// <summary>
+    /// Turns raw node text into a clean single-line string: HTML entities are
+    /// decoded, each whitespace run becomes one space, and the ends are trimmed.
+    /// </summary>
+    /// <param name="raw">Text as read from the HTML document.</param>
+    /// <returns>The cleaned text; empty when decoding yields no text.</returns>
+    public static string Normalize(string raw)
+    {
+        var decoded = HtmlEntity.DeEntitize(raw);
+        if (decoded is null)
+            return string.Empty;
+
+        var singleSpaced = WhitespaceRuns.Replace(decoded, " ");
+        return singleSpaced.Trim();
+    }
+}