using System.Text.Json;
using HtmlAgilityPack;

namespace BlueLaminate.Scraper.Wiki;

/// <summary>
/// Fetches a rendered page from the Counter-Strike Fandom wiki, shared by all
/// wiki scrapers.
/// </summary>
/// <remarks>
/// The rendered HTML pages sit behind Cloudflare, which 403s .NET's TLS
/// fingerprint regardless of headers. The MediaWiki <c>action=parse</c> API is
/// not challenged, so we fetch the same content as JSON from there and return
/// the embedded HTML as a parsed document.
/// </remarks>
public sealed class WikiPageFetcher
{
    private const string ApiBase = "https://counterstrike.fandom.com/api.php";

    private readonly HttpClient _http;

    /// <summary>Creates a fetcher that issues its requests through <paramref name="http"/>.</summary>
    /// <param name="http">The HTTP client used for every API call made by this instance.</param>
    public WikiPageFetcher(HttpClient http) => _http = http;

    /// <summary>Loads a wiki page (e.g. "Weapons") as a parsed HTML document.</summary>
    /// <param name="page">Title of the wiki page; it is URL-escaped before being sent.</param>
    /// <param name="ct">Token used to cancel the HTTP request and the JSON parsing.</param>
    /// <returns>The page's rendered HTML, loaded into an <see cref="HtmlDocument"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="page"/> is <c>null</c>.</exception>
    /// <exception cref="HttpRequestException">The API responded with a non-success status code.</exception>
    /// <exception cref="JsonException">The response body is not valid JSON.</exception>
    /// <exception cref="InvalidOperationException">
    /// The API returned an <c>error</c> object, or the parsed text value was JSON <c>null</c>.
    /// </exception>
    /// <exception cref="KeyNotFoundException">
    /// The response lacks the expected <c>parse</c> / <c>text</c> / <c>*</c> properties.
    /// </exception>
    public async Task<HtmlDocument> LoadAsync(string page, CancellationToken ct = default)
    {
        // Fail fast with the same exception type Uri.EscapeDataString would raise for null.
        ArgumentNullException.ThrowIfNull(page);

        var url = $"{ApiBase}?action=parse&page={Uri.EscapeDataString(page)}&prop=text&format=json";

        using var resp = await _http.GetAsync(url, ct);
        resp.EnsureSuccessStatusCode();

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
        using var json = await JsonDocument.ParseAsync(stream, cancellationToken: ct);

        // The API can answer with a success status and still carry an "error"
        // object in the body, so the payload has to be inspected explicitly.
        if (json.RootElement.TryGetProperty("error", out var error))
        {
            var info = error.TryGetProperty("info", out var i) ? i.GetString() : "unknown error";
            throw new InvalidOperationException($"Wiki API returned an error for page '{page}': {info}");
        }

        // The rendered HTML is read from parse -> text -> "*" in the response.
        var html = json.RootElement
            .GetProperty("parse")
            .GetProperty("text")
            .GetProperty("*")
            .GetString()
            ?? throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }
}