52 lines
1.9 KiB
C#
52 lines
1.9 KiB
C#
using System.Text.Json;
|
|
using HtmlAgilityPack;
|
|
|
|
namespace BlueLaminate.Scraper.Wiki;
|
|
|
|
/// <summary>
/// Fetches a rendered page from the Counter-Strike Fandom wiki, shared by all
/// wiki scrapers.
///
/// The rendered HTML pages sit behind Cloudflare, which 403s .NET's TLS
/// fingerprint regardless of headers. The MediaWiki <c>action=parse</c> API is
/// not challenged, so we fetch the same content as JSON from there and return
/// the embedded HTML as a parsed document.
/// </summary>
public sealed class WikiPageFetcher
{
    private const string ApiBase = "https://counterstrike.fandom.com/api.php";

    private readonly HttpClient _http;

    /// <summary>Creates a fetcher that sends its API requests through <paramref name="http"/>.</summary>
    /// <param name="http">Client used for every request; this class never disposes it.</param>
    /// <exception cref="ArgumentNullException"><paramref name="http"/> is <c>null</c>.</exception>
    public WikiPageFetcher(HttpClient http) =>
        // Fail at construction time instead of with a NullReferenceException on first use.
        _http = http ?? throw new ArgumentNullException(nameof(http));

    /// <summary>Loads a wiki page (e.g. "Weapons") as a parsed HTML document.</summary>
    /// <param name="page">Title of the wiki page; it is URL-escaped before being sent.</param>
    /// <param name="ct">Token that cancels the HTTP request and the JSON parsing.</param>
    /// <returns>The HTML embedded in the API response, loaded into an <see cref="HtmlDocument"/>.</returns>
    /// <exception cref="HttpRequestException">The API answered with a non-success status code.</exception>
    /// <exception cref="JsonException">The response body was not valid JSON.</exception>
    /// <exception cref="InvalidOperationException">
    /// The API reported an error, or the response did not contain the parsed HTML.
    /// </exception>
    public async Task<HtmlDocument> LoadAsync(string page, CancellationToken ct = default)
    {
        var url = $"{ApiBase}?action=parse&page={Uri.EscapeDataString(page)}&prop=text&format=json";

        using var resp = await _http.GetAsync(url, ct);
        resp.EnsureSuccessStatusCode();

        await using var stream = await resp.Content.ReadAsStreamAsync(ct);
        using var json = await JsonDocument.ParseAsync(stream, cancellationToken: ct);

        // The string is copied out of the JsonDocument before the document is disposed.
        var html = ExtractHtml(json.RootElement, page);

        var doc = new HtmlDocument();
        doc.LoadHtml(html);
        return doc;
    }

    /// <summary>
    /// Pulls the HTML string out of an <c>action=parse</c> response, i.e. the value at
    /// <c>parse.text["*"]</c>, translating every malformed or error response into an
    /// <see cref="InvalidOperationException"/> that names the page.
    /// </summary>
    private static string ExtractHtml(JsonElement root, string page)
    {
        if (root.ValueKind == JsonValueKind.Object)
        {
            if (root.TryGetProperty("error", out var error))
            {
                // Only read "info" when the payload has the expected shape; anything else
                // is still reported as an API error rather than a JSON access failure.
                var info = error.ValueKind == JsonValueKind.Object
                    && error.TryGetProperty("info", out var i)
                    && i.ValueKind == JsonValueKind.String
                        ? i.GetString()
                        : "unknown error";
                throw new InvalidOperationException($"Wiki API returned an error for page '{page}': {info}");
            }

            // TryGetProperty (unlike GetProperty) does not throw KeyNotFoundException, so a
            // response missing any link of the chain falls through to the message below.
            if (root.TryGetProperty("parse", out var parse)
                && parse.ValueKind == JsonValueKind.Object
                && parse.TryGetProperty("text", out var text)
                && text.ValueKind == JsonValueKind.Object
                && text.TryGetProperty("*", out var content)
                && content.ValueKind == JsonValueKind.String
                && content.GetString() is { } markup)
            {
                return markup;
            }
        }

        throw new InvalidOperationException($"Wiki API response for page '{page}' had no parsed text.");
    }
}
|