Files
Operation-Blue-Laminate-v2/BlueLaminate/BlueLaminate.Scraper/Wiki/WikiPageFetcher.cs
2026-05-29 14:00:58 -05:00

52 lines
1.9 KiB
C#

using System.Text.Json;
using HtmlAgilityPack;
namespace BlueLaminate.Scraper.Wiki;
/// <summary>
/// Fetches a rendered page from the Counter-Strike Fandom wiki, shared by all
/// wiki scrapers.
///
/// The rendered HTML pages sit behind Cloudflare, which 403s .NET's TLS
/// fingerprint regardless of headers. The MediaWiki <c>action=parse</c> API is
/// not challenged, so we fetch the same content as JSON from there and return
/// the embedded HTML as a parsed document.
/// </summary>
public sealed class WikiPageFetcher
{
    private const string ApiBase = "https://counterstrike.fandom.com/api.php";

    private readonly HttpClient _http;

    /// <summary>Creates a fetcher that issues its requests through <paramref name="http"/>.</summary>
    /// <param name="http">Client used for every API request; this class never disposes it.</param>
    /// <exception cref="ArgumentNullException"><paramref name="http"/> is <see langword="null"/>.</exception>
    public WikiPageFetcher(HttpClient http) =>
        _http = http ?? throw new ArgumentNullException(nameof(http));

    /// <summary>Loads a wiki page (e.g. "Weapons") as a parsed HTML document.</summary>
    /// <param name="page">Title of the wiki page; it is URL-escaped before being sent.</param>
    /// <param name="ct">Token that cancels the request and the reading of the response.</param>
    /// <returns>The HTML embedded in the API response, loaded into an <see cref="HtmlDocument"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="page"/> is <see langword="null"/>.</exception>
    /// <exception cref="HttpRequestException">The API answered with a non-success status code.</exception>
    /// <exception cref="JsonException">The response body is not valid JSON.</exception>
    /// <exception cref="InvalidOperationException">
    /// The API reported an error for the page, or the response did not carry the
    /// parsed text where it is expected.
    /// </exception>
    public async Task<HtmlDocument> LoadAsync(string page, CancellationToken ct = default)
    {
        // Fail before any network traffic instead of inside Uri.EscapeDataString.
        if (page is null)
        {
            throw new ArgumentNullException(nameof(page));
        }

        var url = $"{ApiBase}?action=parse&page={Uri.EscapeDataString(page)}&prop=text&format=json";

        using var resp = await _http.GetAsync(url, ct);
        resp.EnsureSuccessStatusCode();

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
        using var json = await JsonDocument.ParseAsync(stream, cancellationToken: ct);

        // The HTML is copied out as a string here, so nothing refers to the
        // JsonDocument once it is disposed at the end of this method.
        var html = ExtractHtml(json.RootElement, page);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    /// <summary>
    /// Pulls the HTML string out of an <c>action=parse</c> response, turning every
    /// unexpected response shape into an <see cref="InvalidOperationException"/>
    /// that names the requested page.
    /// </summary>
    private static string ExtractHtml(JsonElement root, string page)
    {
        // TryGetProperty itself throws on anything but an object, so check first
        // and report the problem with the page name attached.
        if (root.ValueKind != JsonValueKind.Object)
        {
            throw new InvalidOperationException($"Wiki API response for page '{page}' was not a JSON object.");
        }

        if (root.TryGetProperty("error", out var error))
        {
            // Fall back to "unknown error" whenever the info text is absent,
            // null, or not a string, rather than failing while reporting a failure.
            var info = error.ValueKind == JsonValueKind.Object
                && error.TryGetProperty("info", out var detail)
                && detail.ValueKind == JsonValueKind.String
                    ? detail.GetString()
                    : "unknown error";
            throw new InvalidOperationException($"Wiki API returned an error for page '{page}': {info}");
        }

        // The HTML is expected at parse -> text -> "*". Each step is checked so a
        // missing or mistyped level never escapes as a bare KeyNotFoundException.
        if (root.TryGetProperty("parse", out var parse)
            && parse.ValueKind == JsonValueKind.Object
            && parse.TryGetProperty("text", out var text)
            && text.ValueKind == JsonValueKind.Object
            && text.TryGetProperty("*", out var body)
            && body.ValueKind == JsonValueKind.String)
        {
            var html = body.GetString();
            if (html is not null)
            {
                return html;
            }
        }

        throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");
    }
}