173 lines
5.8 KiB
C#
173 lines
5.8 KiB
C#
using System.Text.RegularExpressions;
|
|
using BlueLaminate.Scraper.Wiki;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace BlueLaminate.Scraper.Weapons;
|
|
|
|
/// <summary>
/// Scrapes the CS2 weapon catalogue from the wiki's "Weapons" page.
///
/// Layout: the page has one "tabber" per weapon category, each immediately
/// preceded by a section heading (the category / Type). Inside each tabber the
/// "Global Offensive &amp; Counter-Strike 2" tab holds a gallery of captions —
/// one per weapon, optionally suffixed with "(CT)" or "(T)" for team-locked
/// weapons.
/// </summary>
public sealed class WeaponWikiScraper
{
    private const string Page = "Weapons";
    private const string Cs2TabHash = "Global_Offensive_&_Counter-Strike_2";

    // XPath queries used by the scraper. Every class test goes through
    // HasClass so the "whole token" matching idiom is written exactly once.
    private static readonly string HeadingsAndTabbersXPath =
        $"//h2 | //h3 | //h4 | //div[{HasClass("tabber")}]";

    private static readonly string TabsXPath = $".//li[{HasClass("wds-tabs__tab")}]";
    private static readonly string TabContentsXPath = $".//div[{HasClass("wds-tab__content")}]";
    private static readonly string CaptionsXPath = $".//div[{HasClass("lightbox-caption")}]";
    private static readonly string HeadlineXPath = $".//span[{HasClass("mw-headline")}]";

    // Matches a trailing "(CT)" / "(T)" team annotation, capturing the team.
    private static readonly Regex TeamAnnotation =
        new(@"\s*\((CT|T)\)\s*$", RegexOptions.Compiled);

    // The wiki labels the default knife "Stock Knife"; drop the prefix.
    private static readonly Regex StockPrefix =
        new(@"^Stock\s+", RegexOptions.Compiled);

    private readonly WikiPageFetcher _fetcher;

    /// <summary>Creates a scraper that loads the wiki page through <paramref name="fetcher"/>.</summary>
    /// <param name="fetcher">Fetcher used to load the "Weapons" page.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fetcher"/> is null.</exception>
    public WeaponWikiScraper(WikiPageFetcher fetcher)
    {
        // Fail at construction time rather than with a NullReferenceException
        // on the first ScrapeAsync call.
        ArgumentNullException.ThrowIfNull(fetcher);
        _fetcher = fetcher;
    }

    /// <summary>
    /// Loads the "Weapons" page and returns one <see cref="ScrapedWeapon"/> per
    /// distinct weapon name, in the order the names were first encountered.
    /// </summary>
    /// <param name="ct">Token forwarded to the page fetcher.</param>
    /// <returns>
    /// The aggregated weapons; empty when the page contains no headings or
    /// tabbers, or no tabber exposes the CS2 tab.
    /// </returns>
    public async Task<IReadOnlyList<ScrapedWeapon>> ScrapeAsync(CancellationToken ct = default)
    {
        var doc = await _fetcher.LoadAsync(Page, ct);
        var aggregator = new WeaponAggregator();

        // Headings and tabbers in document order so each tabber inherits the
        // most recent heading as its category.
        var nodes = doc.DocumentNode.SelectNodes(HeadingsAndTabbersXPath);
        if (nodes is null)
        {
            // SelectNodes yields null (not an empty list) when nothing matches.
            return aggregator.Build();
        }

        string? currentType = null;
        foreach (var node in nodes)
        {
            if (node.Name is "h2" or "h3" or "h4")
            {
                currentType = HeadingText(node);
                continue;
            }

            // A tabber that appears before any heading has no category; skip it.
            if (currentType is null)
            {
                continue;
            }

            foreach (var caption in Cs2Captions(node))
            {
                aggregator.Add(caption, currentType);
            }
        }

        return aggregator.Build();
    }

    /// <summary>
    /// Builds an XPath predicate that is true when the element's class
    /// attribute contains <paramref name="className"/> as a whole
    /// space-separated token (not merely as a substring).
    /// </summary>
    private static string HasClass(string className) =>
        $"contains(concat(' ', normalize-space(@class), ' '), ' {className} ')";

    /// <summary>Caption texts from the CS2 tab of a single tabber, if present.</summary>
    private static IEnumerable<string> Cs2Captions(HtmlNode tabber)
    {
        var index = Cs2TabIndex(tabber);
        if (index < 0)
        {
            yield break;
        }

        // Tab labels and tab contents are matched up purely by position.
        var contents = tabber.SelectNodes(TabContentsXPath);
        if (contents is null || index >= contents.Count)
        {
            yield break;
        }

        var captions = contents[index].SelectNodes(CaptionsXPath);
        if (captions is null)
        {
            yield break;
        }

        foreach (var caption in captions)
        {
            yield return WikiText.Normalize(caption.InnerText);
        }
    }

    /// <summary>
    /// Position of the CS2 tab among the tabber's tab labels, or -1 when the
    /// tabber has no tab labels or none of them carries the CS2 hash.
    /// </summary>
    private static int Cs2TabIndex(HtmlNode tabber)
    {
        var tabs = tabber.SelectNodes(TabsXPath);
        if (tabs is null)
        {
            return -1;
        }

        for (var i = 0; i < tabs.Count; i++)
        {
            // HtmlAgilityPack returns attribute values un-decoded, and the wiki
            // entity-encodes the "&" in this hash (&amp;).
            var hash = HtmlEntity.DeEntitize(tabs[i].GetAttributeValue("data-hash", string.Empty));
            if (hash == Cs2TabHash)
            {
                return i;
            }
        }

        return -1;
    }

    /// <summary>
    /// Normalized text of a heading, taken from its "mw-headline" span when
    /// one exists and from the whole heading otherwise.
    /// </summary>
    private static string HeadingText(HtmlNode heading)
    {
        var headline = heading.SelectSingleNode(HeadlineXPath);
        return WikiText.Normalize((headline ?? heading).InnerText);
    }

    /// <summary>
    /// Collapses the per-caption rows into one weapon per name, tracking which
    /// teams it appeared for so a weapon shown as both "(CT)" and "(T)" (or with
    /// no annotation) resolves to "Both".
    /// </summary>
    private sealed class WeaponAggregator
    {
        private sealed class Entry
        {
            public required string Name { get; init; }
            public required string Type { get; init; }
            public bool SawCt { get; set; }
            public bool SawT { get; set; }
            public bool SawUnannotated { get; set; }
        }

        // Lookup by weapon name, plus the same entries in first-seen order.
        private readonly Dictionary<string, Entry> _byName = new();
        private readonly List<Entry> _entries = new();

        /// <summary>
        /// Records one gallery caption. The trailing team annotation and the
        /// "Stock" prefix are stripped to obtain the weapon name; captions that
        /// are empty, or empty after stripping, are ignored. The category of
        /// the first caption seen for a name is the one that is kept.
        /// </summary>
        /// <param name="caption">Caption text, e.g. "Name (CT)".</param>
        /// <param name="type">Category heading the caption was found under.</param>
        public void Add(string caption, string type)
        {
            if (string.IsNullOrEmpty(caption))
            {
                return;
            }

            // Run the annotation regex once and reuse the match both for the
            // team and for cutting the annotation off the name.
            var match = TeamAnnotation.Match(caption);
            var withoutTeam = match.Success
                ? caption.Remove(match.Index, match.Length)
                : caption;

            var name = StockPrefix.Replace(withoutTeam, string.Empty).Trim();
            if (name.Length == 0)
            {
                return;
            }

            if (!_byName.TryGetValue(name, out var entry))
            {
                entry = new Entry { Name = name, Type = type };
                _byName[name] = entry;
                _entries.Add(entry);
            }

            if (!match.Success)
            {
                entry.SawUnannotated = true;
            }
            else if (match.Groups[1].Value == "CT")
            {
                entry.SawCt = true;
            }
            else
            {
                entry.SawT = true;
            }
        }

        /// <summary>
        /// Produces the final weapon list in first-seen order. A weapon is "CT"
        /// or "T" only when every caption for it carried that one annotation;
        /// any other combination resolves to "Both".
        /// </summary>
        public IReadOnlyList<ScrapedWeapon> Build()
        {
            var result = new List<ScrapedWeapon>(_entries.Count);
            foreach (var e in _entries)
            {
                var team = (e.SawUnannotated, e.SawCt, e.SawT) switch
                {
                    (false, true, false) => "CT",
                    (false, false, true) => "T",
                    _ => "Both",
                };
                result.Add(new ScrapedWeapon(e.Name, e.Type, team));
            }

            return result;
        }
    }
}
|