using System.Text.RegularExpressions;
using BlueLaminate.Scraper.Wiki;
using HtmlAgilityPack;

namespace BlueLaminate.Scraper.Weapons;

/// <summary>
/// Scrapes the CS2 weapon catalogue from the wiki's "Weapons" page.
///
/// Layout: the page has one "tabber" per weapon category, each immediately
/// preceded by a section heading (the category / Type). Inside each tabber the
/// "Global Offensive &amp; Counter-Strike 2" tab holds a gallery of captions —
/// one per weapon, optionally suffixed with "(CT)" or "(T)" for team-locked
/// weapons.
/// </summary>
public sealed class WeaponWikiScraper
{
    private const string Page = "Weapons";
    private const string Cs2TabHash = "Global_Offensive_&_Counter-Strike_2";

    // Matches a trailing "(CT)" / "(T)" team annotation, capturing the team.
    private static readonly Regex TeamAnnotation =
        new(@"\s*\((CT|T)\)\s*$", RegexOptions.Compiled);

    // The wiki labels the default knife "Stock Knife"; drop the prefix.
    private static readonly Regex StockPrefix =
        new(@"^Stock\s+", RegexOptions.Compiled);

    // XPath queries, built once. Each class test goes through HasClass so the
    // whole-token matching logic lives in a single place.
    private static readonly string HeadingsAndTabbersXPath =
        $"//h2 | //h3 | //h4 | //div[{HasClass("tabber")}]";

    private static readonly string TabXPath = $".//li[{HasClass("wds-tabs__tab")}]";

    private static readonly string TabContentXPath = $".//div[{HasClass("wds-tab__content")}]";

    private static readonly string CaptionXPath = $".//div[{HasClass("lightbox-caption")}]";

    private static readonly string HeadlineXPath = $".//span[{HasClass("mw-headline")}]";

    private readonly WikiPageFetcher _fetcher;

    /// <summary>Creates a scraper that loads pages through <paramref name="fetcher"/>.</summary>
    /// <param name="fetcher">Fetcher used to load the "Weapons" page.</param>
    public WeaponWikiScraper(WikiPageFetcher fetcher) => _fetcher = fetcher;

    /// <summary>
    /// Loads the "Weapons" page and collapses the CS2 gallery captions into one
    /// <see cref="ScrapedWeapon"/> per distinct weapon name.
    /// </summary>
    /// <param name="ct">Cancellation token forwarded to the page fetcher.</param>
    /// <returns>
    /// The weapons in the order their names were first encountered; empty when
    /// the page contains no headings or tabbers.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedWeapon>> ScrapeAsync(CancellationToken ct = default)
    {
        var doc = await _fetcher.LoadAsync(Page, ct);

        // Headings and tabbers in document order so each tabber inherits the
        // most recent heading as its category.
        var nodes = doc.DocumentNode.SelectNodes(HeadingsAndTabbersXPath);

        var aggregator = new WeaponAggregator();
        if (nodes is null)
        {
            return aggregator.Build();
        }

        string? currentType = null;
        foreach (var node in nodes)
        {
            if (node.Name is "h2" or "h3" or "h4")
            {
                currentType = HeadingText(node);
                continue;
            }

            // A tabber that appears before any heading has no category to inherit.
            if (currentType is null)
            {
                continue;
            }

            foreach (var caption in Cs2Captions(node))
            {
                aggregator.Add(caption, currentType);
            }
        }

        return aggregator.Build();
    }

    /// <summary>
    /// Builds an XPath predicate that is true when the element's <c>class</c>
    /// attribute contains <paramref name="cssClass"/> as a whole
    /// whitespace-separated token (so "tabber" does not match "tabber-foo").
    /// </summary>
    private static string HasClass(string cssClass) =>
        $"contains(concat(' ', normalize-space(@class), ' '), ' {cssClass} ')";

    /// <summary>Caption texts from the CS2 tab of a single tabber, if present.</summary>
    /// <param name="tabber">The tabber element to search.</param>
    /// <returns>
    /// The normalized caption texts of the CS2 tab; nothing when the tabber has
    /// no tabs, no CS2 tab, no matching content panel, or no captions.
    /// </returns>
    private static IEnumerable<string> Cs2Captions(HtmlNode tabber)
    {
        var tabs = tabber.SelectNodes(TabXPath);
        if (tabs is null)
        {
            yield break;
        }

        var index = -1;
        for (var i = 0; i < tabs.Count; i++)
        {
            // HtmlAgilityPack returns attribute values un-decoded, and the wiki
            // entity-encodes the "&" in this hash (&amp;).
            var hash = HtmlEntity.DeEntitize(tabs[i].GetAttributeValue("data-hash", string.Empty));
            if (hash == Cs2TabHash)
            {
                index = i;
                break;
            }
        }

        if (index < 0)
        {
            yield break;
        }

        // The i-th tab is paired with the i-th content panel by position.
        // NOTE(review): assumes tabs and panels appear in the same order and
        // that no tabber is nested inside another — TODO confirm against the page.
        var contents = tabber.SelectNodes(TabContentXPath);
        if (contents is null || index >= contents.Count)
        {
            yield break;
        }

        var captions = contents[index].SelectNodes(CaptionXPath);
        if (captions is null)
        {
            yield break;
        }

        foreach (var caption in captions)
        {
            yield return WikiText.Normalize(caption.InnerText);
        }
    }

    /// <summary>
    /// Normalized text of a section heading, preferring its "mw-headline" span
    /// when one exists and falling back to the whole heading otherwise.
    /// </summary>
    private static string HeadingText(HtmlNode heading)
    {
        var headline = heading.SelectSingleNode(HeadlineXPath);
        return WikiText.Normalize((headline ?? heading).InnerText);
    }

    /// <summary>
    /// Collapses the per-caption rows into one weapon per name, tracking which
    /// teams it appeared for so a weapon shown as both "(CT)" and "(T)" (or with
    /// no annotation) resolves to "Both".
    /// </summary>
    private sealed class WeaponAggregator
    {
        /// <summary>Accumulated state for one weapon name.</summary>
        private sealed class Entry
        {
            // Category under which the weapon was first seen.
            public required string Type { get; init; }
            public bool SawCt;
            public bool SawT;
            public bool SawUnannotated;
        }

        private readonly Dictionary<string, Entry> _byName = new();

        // Names in first-seen order, so Build() output is stable.
        private readonly List<string> _order = new();

        /// <summary>
        /// Records one gallery caption. The team annotation and any leading
        /// "Stock " prefix are stripped to obtain the weapon name; captions that
        /// are empty or reduce to an empty name are ignored.
        /// </summary>
        /// <param name="caption">Caption text, e.g. "M4A4 (CT)".</param>
        /// <param name="type">
        /// Category of the enclosing section; only the first one seen for a
        /// given name is kept.
        /// </param>
        public void Add(string caption, string type)
        {
            if (string.IsNullOrEmpty(caption))
            {
                return;
            }

            // The pattern is anchored at the end of the string, so a successful
            // match always runs to the end and everything before match.Index is
            // the un-annotated name. This avoids a second regex pass.
            var match = TeamAnnotation.Match(caption);
            var unannotated = match.Success ? caption[..match.Index] : caption;

            var name = StockPrefix.Replace(unannotated, string.Empty).Trim();
            if (name.Length == 0)
            {
                return;
            }

            if (!_byName.TryGetValue(name, out var entry))
            {
                entry = new Entry { Type = type };
                _byName[name] = entry;
                _order.Add(name);
            }

            if (!match.Success)
            {
                entry.SawUnannotated = true;
            }
            else if (match.Groups[1].Value == "CT")
            {
                entry.SawCt = true;
            }
            else
            {
                entry.SawT = true;
            }
        }

        /// <summary>
        /// Produces one <see cref="ScrapedWeapon"/> per recorded name, in
        /// first-seen order, with team "CT", "T" or "Both".
        /// </summary>
        public IReadOnlyList<ScrapedWeapon> Build()
        {
            var result = new List<ScrapedWeapon>(_order.Count);
            foreach (var name in _order)
            {
                var e = _byName[name];

                // Team-locked only when seen exclusively with one annotation;
                // every other combination means both sides can use it.
                var team = (e.SawUnannotated, e.SawCt, e.SawT) switch
                {
                    (false, true, false) => "CT",
                    (false, false, true) => "T",
                    _ => "Both",
                };

                result.Add(new ScrapedWeapon(name, e.Type, team));
            }

            return result;
        }
    }
}