From b9294585ed0ab8af506ddc50cceaf0f09164d344 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 10:39:43 +0100 Subject: [PATCH 01/21] Deduplicate and refactor TextAugmentedHandler Deduplication: when injecting search services, rendering links, canvas annotation refs, manifest annotations, and figures into an existing manifest, skip any entry whose id is already present. Consolidates the check into a single AddIfNew(array, item, prepend) helper. Refactor: extract the five functional sections of Handle into named private methods (InjectSearchServices, InjectRenderingLinks, InjectCanvasAnnotationRefs, InjectManifestAnnotationsRefAsync, InjectFiguresRefAsync), leaving Handle as a slim orchestrator. Tests: adds 8 deduplication tests covering service, rendering, and annotations blocks. Co-Authored-By: Claude Sonnet 4.6 --- .../TextAugmented/TextAugmentedQuery.cs | 284 ++++++++++-------- .../SearchApi/TextAugmentedHandlerTests.cs | 144 +++++++++ 2 files changed, 298 insertions(+), 130 deletions(-) diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs index c295965..fcb3566 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs @@ -1,5 +1,6 @@ using System.Text.Json.Nodes; using MediatR; +using TextServices.Core.Models; using TextServices.Search.Api.Services; using TextServices.Storage; @@ -33,38 +34,49 @@ public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache) var node = JsonNode.Parse(json); if (node is not JsonObject manifest) return null; - // Replace @id / id with the text-augmented URL for this manifest. - if (manifest.ContainsKey("@id")) - manifest["@id"] = request.SelfUrl; - else - manifest["id"] = request.SelfUrl; + var baseUrl = request.SearchBaseUrl; + var id = request.UrlId ?? request.Id; + var text = await textCache.GetTextAsync(request.Id, ct); - // Build service descriptors using Presentation 3 id/type conventions. - // v2 is listed first; v1 follows for backward-compatible clients. + ReplaceSelfUrl(manifest, request.SelfUrl); + InjectSearchServices(manifest, baseUrl, id); + InjectRenderingLinks(manifest, baseUrl, id, text); + InjectCanvasAnnotationRefs(manifest, baseUrl, id, text); + await InjectManifestAnnotationsRefAsync(manifest, baseUrl, id, request.Id, ct); + await InjectFiguresRefAsync(manifest, baseUrl, id, request.Id, ct); - // TODO - de-duplicate when adding services - var base_ = request.SearchBaseUrl; - var id = request.UrlId ?? request.Id; + return manifest; + } + private static void ReplaceSelfUrl(JsonObject manifest, string selfUrl) + { + if (manifest.ContainsKey("@id")) + manifest["@id"] = selfUrl; + else + manifest["id"] = selfUrl; + } + + private static void InjectSearchServices(JsonObject manifest, string baseUrl, string id) + { var searchServiceV2 = new JsonObject { - ["id"] = $"{base_}/search/v2/{id}", // TODO - can we build these from a central place? + ["id"] = $"{baseUrl}/search/v2/{id}", // TODO - can we build these from a central place? ["type"] = "SearchService2", ["service"] = new JsonArray(new JsonObject { - ["id"] = $"{base_}/autocomplete/v2/{id}", + ["id"] = $"{baseUrl}/autocomplete/v2/{id}", ["type"] = "AutoCompleteService2", }), }; var searchServiceV1 = new JsonObject { - ["id"] = $"{base_}/search/v1/{id}", + ["id"] = $"{baseUrl}/search/v1/{id}", ["type"] = "SearchService1", ["profile"] = "http://iiif.io/api/search/1/search", ["service"] = new JsonArray(new JsonObject { - ["id"] = $"{base_}/autocomplete/v1/{id}", + ["id"] = $"{baseUrl}/autocomplete/v1/{id}", ["type"] = "AutoCompleteService1", ["profile"] = "http://iiif.io/api/search/1/autocomplete", }), @@ -72,161 +84,173 @@ public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache) // Insert v2 then v1 at position 0 so v2 appears first. // Existing services (e.g. IIIF Image services) are pushed down, not displaced. + // Deduplication: skip insertion when a service with the same "id" already exists. if (manifest["service"] is JsonArray existingArray) { - existingArray.Insert(0, searchServiceV1); - existingArray.Insert(0, searchServiceV2); + AddIfNew(existingArray, searchServiceV1); + AddIfNew(existingArray, searchServiceV2); } else if (manifest["service"] is JsonObject existingObject) { // Spec allows service to be a single object — promote to array. - manifest["service"] = new JsonArray( - searchServiceV2, searchServiceV1, existingObject.DeepClone()); + var arr = new JsonArray(existingObject.DeepClone()); + AddIfNew(arr, searchServiceV1); + AddIfNew(arr, searchServiceV2); + manifest["service"] = arr; } else { manifest["service"] = new JsonArray(searchServiceV2, searchServiceV1); } + } + + private static void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text? text) + { + if (text == null) return; - // ---- rendering links (PDF + plain text) --------------------------------- - // Plain text is always added when text artefacts exist. // PDF is only added when at least one canvas is image-based (IsTemporalContent == false); // temporal-only manifests (VTT/audio/video) have no page images to render into a PDF. - var text = await textCache.GetTextAsync(request.Id, ct); - if (text != null) + var hasImageCanvases = text.Images.Any(img => !img.IsTemporalContent); + + var textRef = new JsonObject + { + ["id"] = $"{baseUrl}/text/v1/{id}", + ["type"] = "Text", + ["label"] = new JsonObject { ["en"] = new JsonArray("View as plain text") }, + ["format"] = "text/plain", + }; + + if (manifest["rendering"] is JsonArray existingRendering) + { + AddIfNew(existingRendering, textRef); + if (hasImageCanvases) AddIfNew(existingRendering, BuildPdfRef(baseUrl, id)); + } + else if (manifest["rendering"] is JsonObject singleRendering) + { + var arr = new JsonArray(singleRendering.DeepClone()); + AddIfNew(arr, textRef); + if (hasImageCanvases) AddIfNew(arr, BuildPdfRef(baseUrl, id)); + manifest["rendering"] = arr; + } + else { - var hasImageCanvases = text.Images.Any(img => !img.IsTemporalContent); + manifest["rendering"] = hasImageCanvases + ? new JsonArray(BuildPdfRef(baseUrl, id), textRef) + : new JsonArray(textRef); + } + } - var textRef = new JsonObject + private static void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text? text) + { + if (text == null || manifest["items"] is not JsonArray canvases) return; + + // Build a set of canvas indices that have at least one word, in O(words). + var canvasesWithWords = text.Words.Values.Select(w => w.Idx).ToHashSet(); + + for (var i = 0; i < canvases.Count && i < text.Images.Length; i++) + { + if (!canvasesWithWords.Contains(i)) continue; + if (canvases[i] is not JsonObject canvas) continue; + + var linesRef = new JsonObject { - ["id"] = $"{base_}/text/v1/{id}", - ["type"] = "Text", - ["label"] = new JsonObject { ["en"] = new JsonArray("View as plain text") }, - ["format"] = "text/plain", + ["id"] = $"{baseUrl}/annotations/lines/v1/{i}/{id}", + ["type"] = "AnnotationPage", + ["label"] = new JsonObject { ["en"] = new JsonArray("Line-level transcription") }, + }; + var wordsRef = new JsonObject + { + ["id"] = $"{baseUrl}/annotations/words/v1/{i}/{id}", + ["type"] = "AnnotationPage", + ["label"] = new JsonObject { ["en"] = new JsonArray("Word-level transcription") }, }; - if (manifest["rendering"] is JsonArray existingRendering) + if (canvas["annotations"] is JsonArray existingAnnos) { - existingRendering.Insert(0, textRef); - if (hasImageCanvases) - { - var pdfRef = BuildPdfRef(base_, id); - existingRendering.Insert(0, pdfRef); - } + AddIfNew(existingAnnos, wordsRef); + AddIfNew(existingAnnos, linesRef); } - else if (manifest["rendering"] is JsonObject singleRendering) + else if (canvas["annotations"] is JsonObject singleAnno) { - manifest["rendering"] = hasImageCanvases - ? new JsonArray(BuildPdfRef(base_, id), textRef, singleRendering.DeepClone()) - : new JsonArray(textRef, singleRendering.DeepClone()); + var arr = new JsonArray(singleAnno.DeepClone()); + AddIfNew(arr, wordsRef); + AddIfNew(arr, linesRef); + canvas["annotations"] = arr; } else { - manifest["rendering"] = hasImageCanvases - ? new JsonArray(BuildPdfRef(base_, id), textRef) - : new JsonArray(textRef); + canvas["annotations"] = new JsonArray(linesRef, wordsRef); } } + } - // ---- per-canvas line and word annotation page references ---------------- - // Inject line-level and word-level annotation page links into each canvas's - // annotations array so harvesting clients can discover and fetch them. - // Only canvases that actually have words are decorated; sparse canvases are skipped. - if (text != null && manifest["items"] is JsonArray canvases) + private async Task InjectManifestAnnotationsRefAsync( + JsonObject manifest, string baseUrl, string id, string key, CancellationToken ct) + { + var annotationsJson = await textStore.LoadAnnotations(key); + if (annotationsJson == null) return; + + var annotationsRef = new JsonObject { - // Build a set of canvas indices that have at least one word, in O(words). - var canvasesWithWords = text.Words.Values.Select(w => w.Idx).ToHashSet(); + ["id"] = $"{baseUrl}/annotations/manifest/v1/{id}", + ["type"] = "AnnotationPage", + ["profile"] = "https://dlcs.io/profiles/all-text", + ["label"] = new JsonObject { ["en"] = new JsonArray("Text of all canvases") }, + }; - for (var i = 0; i < canvases.Count && i < text.Images.Length; i++) - { - if (!canvasesWithWords.Contains(i)) continue; - if (canvases[i] is not JsonObject canvas) continue; - - var linesRef = new JsonObject - { - ["id"] = $"{base_}/annotations/lines/v1/{i}/{id}", - ["type"] = "AnnotationPage", - ["label"] = new JsonObject { ["en"] = new JsonArray("Line-level transcription") }, - }; - var wordsRef = new JsonObject - { - ["id"] = $"{base_}/annotations/words/v1/{i}/{id}", - ["type"] = "AnnotationPage", - ["label"] = new JsonObject { ["en"] = new JsonArray("Word-level transcription") }, - }; - - if (canvas["annotations"] is JsonArray existingAnnos) - { - existingAnnos.Insert(0, wordsRef); - existingAnnos.Insert(0, linesRef); - } - else if (canvas["annotations"] is JsonObject singleAnno) - { - canvas["annotations"] = new JsonArray( - linesRef, wordsRef, singleAnno.DeepClone()); - } - else - { - canvas["annotations"] = new JsonArray(linesRef, wordsRef); - } - } + if (manifest["annotations"] is JsonArray existingAnnos) + { + AddIfNew(existingAnnos, annotationsRef); } - - // ---- manifest-level line annotations reference -------------------------- - // If the builder stored an annotations.json (all canvases, line granularity), - // add a manifest-level annotations reference so clients can discover it. - var annotationsUrl = $"{base_}/annotations/manifest/v1/{id}"; - var annotationsJson = await textStore.LoadAnnotations(request.Id); - if (annotationsJson != null) + else if (manifest["annotations"] is JsonObject singleAnno) { - var annotationsRef = new JsonObject - { - ["id"] = annotationsUrl, - ["type"] = "AnnotationPage", - ["profile"] = "https://dlcs.io/profiles/all-text", - ["label"] = new JsonObject { ["en"] = new JsonArray("Text of all canvases") }, - }; - - if (manifest["annotations"] is JsonArray existingAnnos) - existingAnnos.Insert(0, annotationsRef); - else if (manifest["annotations"] is JsonObject singleAnno) - manifest["annotations"] = new JsonArray(annotationsRef, singleAnno.DeepClone()); - else - manifest["annotations"] = new JsonArray(annotationsRef); + var arr = new JsonArray(singleAnno.DeepClone()); + AddIfNew(arr, annotationsRef); + manifest["annotations"] = arr; + } + else + { + manifest["annotations"] = new JsonArray(annotationsRef); } + } + + private async Task InjectFiguresRefAsync( + JsonObject manifest, string baseUrl, string id, string key, CancellationToken ct) + { + var figuresJson = await textStore.LoadFigures(key); + if (figuresJson == null) return; - // ---- figures annotation page reference ---------------------------------- - // If the builder stored a figures.json (ComposedBlocks with non-zero area), - // add a manifest-level annotations reference so clients can discover it. - var figuresUrl = $"{base_}/identified/figures/{id}"; - var figuresJson = await textStore.LoadFigures(request.Id); - if (figuresJson != null) + var figuresRef = new JsonObject { - var figuresRef = new JsonObject + ["id"] = $"{baseUrl}/identified/figures/{id}", + ["type"] = "AnnotationPage", + ["label"] = new JsonObject { - ["id"] = figuresUrl, - ["type"] = "AnnotationPage", - ["label"] = new JsonObject - { - ["en"] = new JsonArray("Figures, tables and illustrations"), - }, - }; + ["en"] = new JsonArray("Figures, tables and illustrations"), + }, + }; - if (manifest["annotations"] is JsonArray existingAnnos) - { - existingAnnos.Add(figuresRef); - } - else if (manifest["annotations"] is JsonObject singleAnno) - { - manifest["annotations"] = new JsonArray(singleAnno.DeepClone(), figuresRef); - } - else - { - manifest["annotations"] = new JsonArray(figuresRef); - } + if (manifest["annotations"] is JsonArray existingAnnos) + { + AddIfNew(existingAnnos, figuresRef, prepend: false); + } + else if (manifest["annotations"] is JsonObject singleAnno) + { + var arr = new JsonArray(singleAnno.DeepClone()); + AddIfNew(arr, figuresRef, prepend: false); + manifest["annotations"] = arr; } + else + { + manifest["annotations"] = new JsonArray(figuresRef); + } + } - return manifest; + private static void AddIfNew(JsonArray array, JsonObject item, bool prepend = true) + { + if (array.Any(n => n?["id"]?.GetValue() == item["id"]?.GetValue())) return; + if (prepend) array.Insert(0, item); + else array.Add(item); } private static JsonObject BuildPdfRef(string baseUrl, string id) => new() diff --git a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs index 01ca794..2de04a3 100644 --- a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs @@ -296,6 +296,126 @@ public async Task Handle_WithoutAnnotations_NoManifestAnnotationsRef() result!["annotations"].ShouldBeNull(); } + // ------------------------------------------------------------------------- + // Service deduplication + // ------------------------------------------------------------------------- + + [Fact] + public async Task Handle_ServiceArrayAlreadyContainsSearchV2_DoesNotDuplicateV2() + { + var handler = MakeHandler(V3ManifestWithExistingSearchV2()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var service = result!["service"].ShouldBeOfType(); + service.Count.ShouldBe(2); // original v2 stays; v1 added; no duplicate v2 + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV2).ShouldBe(1); + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV1).ShouldBe(1); + } + + [Fact] + public async Task Handle_ServiceArrayAlreadyContainsBothSearchServices_NeitherDuplicated() + { + var handler = MakeHandler(V3ManifestWithBothSearchServices()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var service = result!["service"].ShouldBeOfType(); + service.Count.ShouldBe(2); // unchanged — both already present + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV2).ShouldBe(1); + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV1).ShouldBe(1); + } + + [Fact] + public async Task Handle_ServiceObjectMatchesSearchV2_PromotesToArrayWithoutDuplicatingV2() + { + var handler = MakeHandler(V3ManifestWithSearchV2ServiceObject()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var service = result!["service"].ShouldBeOfType(); + service.Count.ShouldBe(2); // v1 added; original v2 kept; no duplicate v2 + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV2).ShouldBe(1); + service.Count(n => n?["id"]?.GetValue() == ExpectedSearchV1).ShouldBe(1); + } + + // ------------------------------------------------------------------------- + // Rendering deduplication + // ------------------------------------------------------------------------- + + [Fact] + public async Task Handle_RenderingArrayAlreadyContainsTextRef_DoesNotDuplicateTextRef() + { + var handler = MakeHandler(V3ManifestWithExistingTextRef(), cachedText: SpatialText()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var rendering = result!["rendering"].ShouldBeOfType(); + rendering.Count(n => n?["id"]?.GetValue() == ExpectedRawTextUrl).ShouldBe(1); + } + + [Fact] + public async Task Handle_RenderingArrayAlreadyContainsPdfRef_DoesNotDuplicatePdfRef() + { + var handler = MakeHandler(V3ManifestWithExistingPdfRef(), cachedText: SpatialText()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var rendering = result!["rendering"].ShouldBeOfType(); + rendering.Count(n => n?["id"]?.GetValue() == ExpectedPdfUrl).ShouldBe(1); + } + + [Fact] + public async Task Handle_RenderingArrayAlreadyContainsBothRefs_NeitherDuplicated() + { + var handler = MakeHandler(V3ManifestWithExistingPdfAndTextRefs(), cachedText: SpatialText()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var rendering = result!["rendering"].ShouldBeOfType(); + rendering.Count.ShouldBe(2); // unchanged + rendering.Count(n => n?["id"]?.GetValue() == ExpectedPdfUrl).ShouldBe(1); + rendering.Count(n => n?["id"]?.GetValue() == ExpectedRawTextUrl).ShouldBe(1); + } + + // ------------------------------------------------------------------------- + // Annotations deduplication + // ------------------------------------------------------------------------- + + [Fact] + public async Task Handle_AnnotationsArrayAlreadyContainsAnnotationsRef_DoesNotDuplicate() + { + const string annotationsJson = """{"type":"AnnotationPage","items":[]}"""; + var handler = MakeHandler(V3ManifestWithExistingAnnotationsRef(), annotationsJson: annotationsJson); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var annotations = result!["annotations"].ShouldBeOfType(); + annotations.Count(n => n?["id"]?.GetValue() == + "https://search.example.org/annotations/manifest/v1/test/book").ShouldBe(1); + } + + [Fact] + public async Task Handle_AnnotationsArrayAlreadyContainsFiguresRef_DoesNotDuplicate() + { + const string figuresJson = """{"type":"AnnotationPage","items":[]}"""; + var handler = MakeHandler(V3ManifestWithExistingFiguresRef(), figuresJson: figuresJson); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var annotations = result!["annotations"].ShouldBeOfType(); + annotations.Count(n => n?["id"]?.GetValue() == + "https://search.example.org/identified/figures/test/book").ShouldBe(1); + } + private static TextAugmentedHandler MakeHandler( string? manifestJson, string? annotationsJson = null, @@ -336,6 +456,30 @@ private static string V3ManifestWithServiceObject() private static string V3ManifestWithRendering() => """{"id":"https://example.org/m/1","type":"Manifest","rendering":[{"id":"https://example.org/pdf","type":"Text","format":"application/pdf"}]}"""; + private static string V3ManifestWithExistingSearchV2() + => """{"id":"https://example.org/m/1","type":"Manifest","service":[{"id":"https://search.example.org/search/v2/test/book","type":"SearchService2"}]}"""; + + private static string V3ManifestWithBothSearchServices() + => """{"id":"https://example.org/m/1","type":"Manifest","service":[{"id":"https://search.example.org/search/v2/test/book","type":"SearchService2"},{"id":"https://search.example.org/search/v1/test/book","type":"SearchService1"}]}"""; + + private static string V3ManifestWithSearchV2ServiceObject() + => """{"id":"https://example.org/m/1","type":"Manifest","service":{"id":"https://search.example.org/search/v2/test/book","type":"SearchService2"}}"""; + + private static string V3ManifestWithExistingTextRef() + => """{"id":"https://example.org/m/1","type":"Manifest","rendering":[{"id":"https://search.example.org/text/v1/test/book","type":"Text","format":"text/plain"}]}"""; + + private static string V3ManifestWithExistingPdfRef() + => """{"id":"https://example.org/m/1","type":"Manifest","rendering":[{"id":"https://search.example.org/pdf/v1/test/book","type":"Text","format":"application/pdf"}]}"""; + + private static string V3ManifestWithExistingPdfAndTextRefs() + => """{"id":"https://example.org/m/1","type":"Manifest","rendering":[{"id":"https://search.example.org/pdf/v1/test/book","type":"Text","format":"application/pdf"},{"id":"https://search.example.org/text/v1/test/book","type":"Text","format":"text/plain"}]}"""; + + private static string V3ManifestWithExistingAnnotationsRef() + => """{"id":"https://example.org/m/1","type":"Manifest","annotations":[{"id":"https://search.example.org/annotations/manifest/v1/test/book","type":"AnnotationPage"}]}"""; + + private static string V3ManifestWithExistingFiguresRef() + => """{"id":"https://example.org/m/1","type":"Manifest","annotations":[{"id":"https://search.example.org/identified/figures/test/book","type":"AnnotationPage"}]}"""; + private sealed class StubTextStore( string? manifestJson, string? annotationsJson = null, From 6d7ee1ee56b99ee8e6638abf7a66afc900b73ff0 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 11:14:56 +0100 Subject: [PATCH 02/21] Add logging to TextAugmentedHandler --- .../Features/EndpointHelpers.cs | 2 +- .../TextAugmented/TextAugmentedQuery.cs | 49 ++++++++++++++----- .../SearchApi/CapabilityGatingTests.cs | 5 +- .../SearchApi/TextAugmentedHandlerTests.cs | 4 +- 4 files changed, 43 insertions(+), 17 deletions(-) diff --git a/src/TextServices.Search.Api/Features/EndpointHelpers.cs b/src/TextServices.Search.Api/Features/EndpointHelpers.cs index 5e97e58..bea501e 100644 --- a/src/TextServices.Search.Api/Features/EndpointHelpers.cs +++ b/src/TextServices.Search.Api/Features/EndpointHelpers.cs @@ -12,7 +12,7 @@ namespace TextServices.Search.Api.Features; /// /// Absolute URL for this endpoint response (base + route prefix + effective id + query). /// Absolute URL without query string. Used as the base for child resource IDs (e.g. annotations). -/// Scheme + authority only (no path). Used by TextAugmented to build cross-endpoint service URLs. +/// Scheme + authority only (no path). Configured URL or based on x-forwarded-host if valid. internal record ResolvedRequest(string EffectiveId, string SelfUrl, string ResourceUrl, string BaseUrl); internal static class EndpointHelpers diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs index fcb3566..68ad44e 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs @@ -14,6 +14,10 @@ namespace TextServices.Search.Api.Features.TextAugmented; /// No @context is emitted inside service blocks — it belongs only at document level. /// /// Storage key used to load artefacts from the text store. +/// Absolute URL for this endpoint response (base + route prefix + effective id + query). +/// +/// Scheme + authority only (no path). Used by TextAugmented to build cross-endpoint service URLs +/// /// /// Id to use when generating IIIF service URLs. Differs from when the /// request arrived via a proxy that rewrites the path (X-Forwarded-Path). Defaults to @@ -22,17 +26,29 @@ namespace TextServices.Search.Api.Features.TextAugmented; public record TextAugmentedRequest(string Id, string SelfUrl, string SearchBaseUrl, string? UrlId = null) : IRequest; -public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache) +public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache, ILogger logger) : IRequestHandler { public async Task Handle(TextAugmentedRequest request, CancellationToken ct) { - if (!await textCache.IsEnabledAsync(request.Id, JobServices.TextAugmented, ct)) return null; + if (!await textCache.IsEnabledAsync(request.Id, JobServices.TextAugmented, ct)) + { + logger.LogDebug("Text augmentation is not enabled: {Id}", request.Id); + return null; + } var json = await textStore.LoadManifest(request.Id); - if (json == null) return null; + if (json == null) + { + logger.LogDebug("Manifest not found: {Id}", request.Id); + return null; + } var node = JsonNode.Parse(json); - if (node is not JsonObject manifest) return null; + if (node is not JsonObject manifest) + { + logger.LogDebug("Manifest not JSON: {Id}", request.Id); + return null; + } var baseUrl = request.SearchBaseUrl; var id = request.UrlId ?? request.Id; @@ -42,8 +58,8 @@ public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache) InjectSearchServices(manifest, baseUrl, id); InjectRenderingLinks(manifest, baseUrl, id, text); InjectCanvasAnnotationRefs(manifest, baseUrl, id, text); - await InjectManifestAnnotationsRefAsync(manifest, baseUrl, id, request.Id, ct); - await InjectFiguresRefAsync(manifest, baseUrl, id, request.Id, ct); + await InjectManifestAnnotationsRefAsync(manifest, baseUrl, id, request.Id); + await InjectFiguresRefAsync(manifest, baseUrl, id, request.Id); return manifest; } @@ -56,8 +72,9 @@ private static void ReplaceSelfUrl(JsonObject manifest, string selfUrl) manifest["id"] = selfUrl; } - private static void InjectSearchServices(JsonObject manifest, string baseUrl, string id) + private void InjectSearchServices(JsonObject manifest, string baseUrl, string id) { + logger.LogDebug("Adding search-services: {Id}", id); var searchServiceV2 = new JsonObject { ["id"] = $"{baseUrl}/search/v2/{id}", // TODO - can we build these from a central place? @@ -104,10 +121,12 @@ private static void InjectSearchServices(JsonObject manifest, string baseUrl, st } } - private static void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text? text) + private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text? text) { if (text == null) return; + logger.LogDebug("Adding rendering: {Id}", id); + // PDF is only added when at least one canvas is image-based (IsTemporalContent == false); // temporal-only manifests (VTT/audio/video) have no page images to render into a PDF. var hasImageCanvases = text.Images.Any(img => !img.IsTemporalContent); @@ -140,10 +159,12 @@ private static void InjectRenderingLinks(JsonObject manifest, string baseUrl, st } } - private static void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text? text) + private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text? text) { if (text == null || manifest["items"] is not JsonArray canvases) return; + logger.LogDebug("Adding canvas annotations: {Id}", id); + // Build a set of canvas indices that have at least one word, in O(words). var canvasesWithWords = text.Words.Values.Select(w => w.Idx).ToHashSet(); @@ -184,12 +205,13 @@ private static void InjectCanvasAnnotationRefs(JsonObject manifest, string baseU } } - private async Task InjectManifestAnnotationsRefAsync( - JsonObject manifest, string baseUrl, string id, string key, CancellationToken ct) + private async Task InjectManifestAnnotationsRefAsync(JsonObject manifest, string baseUrl, string id, string key) { var annotationsJson = await textStore.LoadAnnotations(key); if (annotationsJson == null) return; + logger.LogDebug("Adding Manifest annotations: {Id}", id); + var annotationsRef = new JsonObject { ["id"] = $"{baseUrl}/annotations/manifest/v1/{id}", @@ -214,12 +236,13 @@ private async Task InjectManifestAnnotationsRefAsync( } } - private async Task InjectFiguresRefAsync( - JsonObject manifest, string baseUrl, string id, string key, CancellationToken ct) + private async Task InjectFiguresRefAsync(JsonObject manifest, string baseUrl, string id, string key) { var figuresJson = await textStore.LoadFigures(key); if (figuresJson == null) return; + logger.LogDebug("Adding figures: {Id}", id); + var figuresRef = new JsonObject { ["id"] = $"{baseUrl}/identified/figures/{id}", diff --git a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs index 1e4ecbd..50c1dd7 100644 --- a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs +++ b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs @@ -1,3 +1,4 @@ +using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; using TextServices.Search.Api.Features.Annotations; @@ -159,8 +160,8 @@ public async Task WordAnnotations_WhenAnnotationsFlagAbsent_ReturnsNull() [Fact] public async Task TextAugmented_WhenTextAugmentedFlagAbsent_ReturnsNull() { - var handler = new TextAugmentedHandler( - new AllNullStore(), new StubTextCache(JobServices.Search)); + var handler = new TextAugmentedHandler(new AllNullStore(), new StubTextCache(JobServices.Search), + new NullLogger()); var result = await handler.Handle( new TextAugmentedRequest(Id, SelfUrl, SearchBase), CancellationToken.None); diff --git a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs index 2de04a3..651205b 100644 --- a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs @@ -1,4 +1,5 @@ using System.Text.Json.Nodes; +using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; using TextServices.Core.Providers; @@ -421,7 +422,8 @@ private static TextAugmentedHandler MakeHandler( string? annotationsJson = null, string? figuresJson = null, Text? cachedText = null) - => new(new StubTextStore(manifestJson, annotationsJson, figuresJson), new StubTextCache(cachedText)); + => new(new StubTextStore(manifestJson, annotationsJson, figuresJson), new StubTextCache(cachedText), + new NullLogger()); /// A Text with one spatial (image-based) canvas. private static Text SpatialText() From b174c439fbc374b3d6656b03c5d8dfd35abd9619 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 11:29:49 +0100 Subject: [PATCH 03/21] Add logging + IDE formatting --- .editorconfig | 1 + .../Features/Search/SearchHandlerBase.cs | 21 ++++++++++---- .../Features/Search/SearchQuery.cs | 12 ++++---- .../Features/Search/SearchV2Query.cs | 28 +++++++++---------- src/TextServices.Storage/S3TextStore.cs | 28 +++++++------------ .../Core/TextNormalisationEdgeCaseTests.cs | 4 +-- .../SearchApi/CapabilityGatingTests.cs | 4 +-- .../SearchApi/SearchV2HandlerTests.cs | 25 ++++++++++------- 8 files changed, 65 insertions(+), 58 deletions(-) diff --git a/.editorconfig b/.editorconfig index 7efe752..55bba51 100644 --- a/.editorconfig +++ b/.editorconfig @@ -17,6 +17,7 @@ csharp_space_after_cast = false csharp_prefer_braces = when_multiline:warning csharp_style_expression_bodied_methods = false:suggestion csharp_style_expression_bodied_properties = true:suggestion +csharp_method_or_operator_body = expression_body:suggestion dotnet_style_qualification_for_field = false:warning dotnet_style_qualification_for_property = false:warning diff --git a/src/TextServices.Search.Api/Features/Search/SearchHandlerBase.cs b/src/TextServices.Search.Api/Features/Search/SearchHandlerBase.cs index db9bf17..0d976ff 100644 --- a/src/TextServices.Search.Api/Features/Search/SearchHandlerBase.cs +++ b/src/TextServices.Search.Api/Features/Search/SearchHandlerBase.cs @@ -4,21 +4,30 @@ namespace TextServices.Search.Api.Features.Search; -public abstract class SearchHandlerBase(ITextCache cache) +public abstract class SearchHandlerBase(ITextCache cache, ILogger logger) { protected async Task HandleCore(string id, string query, string selfUrl, string resourceUrl, CancellationToken ct) { - if (!await cache.IsEnabledAsync(id, JobServices.Search, ct)) return default; + if (!await cache.IsEnabledAsync(id, JobServices.Search, ct)) + { + logger.LogDebug("Search is disabled: {Id}", id); + return default; + } - if (string.IsNullOrWhiteSpace(query)) - return EmptyQueryResponse(selfUrl, resourceUrl); + if (string.IsNullOrWhiteSpace(query)) return EmptyQueryResponse(selfUrl); var text = await cache.GetTextAsync(id, ct); - if (text == null) return default; + if (text == null) + { + logger.LogInformation("No text found: {Id}", id); + return default; + } + + logger.LogDebug("Searching {Id} for {Query}", id, query); return BuildResponse(text, text.Search(query), selfUrl, resourceUrl); } - protected abstract TResponse EmptyQueryResponse(string selfUrl, string resourceUrl); + protected abstract TResponse EmptyQueryResponse(string selfUrl); protected abstract TResponse BuildResponse(Text text, List rects, string selfUrl, string resourceUrl); } diff --git a/src/TextServices.Search.Api/Features/Search/SearchQuery.cs b/src/TextServices.Search.Api/Features/Search/SearchQuery.cs index 972fd4e..18d38de 100644 --- a/src/TextServices.Search.Api/Features/Search/SearchQuery.cs +++ b/src/TextServices.Search.Api/Features/Search/SearchQuery.cs @@ -5,18 +5,20 @@ namespace TextServices.Search.Api.Features.Search; -public record SearchRequest(string Id, string Query, string SelfUrl, string ResourceUrl) : IRequest; +public record SearchRequest(string Id, string Query, string SelfUrl, string ResourceUrl) + : IRequest; -public class SearchHandler(ITextCache cache) - : SearchHandlerBase(cache), IRequestHandler +public class SearchHandler(ITextCache cache, ILogger logger) + : SearchHandlerBase(cache, logger), IRequestHandler { public Task Handle(SearchRequest request, CancellationToken ct) => HandleCore(request.Id, request.Query, request.SelfUrl, request.ResourceUrl, ct); - protected override SearchAnnotationList EmptyQueryResponse(string selfUrl, string resourceUrl) => + protected override SearchAnnotationList EmptyQueryResponse(string selfUrl) => new() { Id = selfUrl, Within = new SearchLayer { Total = 0 }, Resources = [], Hits = [] }; - protected override SearchAnnotationList BuildResponse(Text text, List rects, string selfUrl, string resourceUrl) + protected override SearchAnnotationList BuildResponse(Text text, List rects, string selfUrl, + string resourceUrl) { var resources = new List(rects.Count); var hits = new List(); diff --git a/src/TextServices.Search.Api/Features/Search/SearchV2Query.cs b/src/TextServices.Search.Api/Features/Search/SearchV2Query.cs index ae7c0dd..0a5ae98 100644 --- a/src/TextServices.Search.Api/Features/Search/SearchV2Query.cs +++ b/src/TextServices.Search.Api/Features/Search/SearchV2Query.cs @@ -6,18 +6,20 @@ namespace TextServices.Search.Api.Features.Search; -public record SearchV2Request(string Id, string Query, string SelfUrl, string ResourceUrl) : IRequest; +public record SearchV2Request(string Id, string Query, string SelfUrl, string ResourceUrl) + : IRequest; -public class SearchV2Handler(ITextCache cache) - : SearchHandlerBase(cache), IRequestHandler +public class SearchV2Handler(ITextCache cache, ILogger logger) + : SearchHandlerBase(cache, logger), IRequestHandler { public Task Handle(SearchV2Request request, CancellationToken ct) => HandleCore(request.Id, request.Query, request.SelfUrl, request.ResourceUrl, ct); - protected override SearchAnnotationPageV2 EmptyQueryResponse(string selfUrl, string resourceUrl) => + protected override SearchAnnotationPageV2 EmptyQueryResponse(string selfUrl) => new() { Id = selfUrl, Items = [], Annotations = null }; - protected override SearchAnnotationPageV2 BuildResponse(Text text, List rects, string selfUrl, string resourceUrl) + protected override SearchAnnotationPageV2 BuildResponse(Text text, List rects, string selfUrl, + string resourceUrl) { var items = new List(rects.Count); var contexts = new List(); @@ -64,9 +66,8 @@ protected override SearchAnnotationPageV2 BuildResponse(Text text, List + new() { Id = id, Target = new SpecificResourceTarget @@ -124,5 +123,4 @@ private static ContextualizingAnnotation MakeContextualizing( ], }, }; - } } diff --git a/src/TextServices.Storage/S3TextStore.cs b/src/TextServices.Storage/S3TextStore.cs index b23de51..7c81dec 100644 --- a/src/TextServices.Storage/S3TextStore.cs +++ b/src/TextServices.Storage/S3TextStore.cs @@ -13,7 +13,7 @@ namespace TextServices.Storage; /// Job key segments (split on /) become S3 key path components, so the key /// "2/books/my-book" is stored under {KeyPrefix}2/books/my-book/. /// -public class S3TextStore : ITextStore, IDisposable +public class S3TextStore(IOptions options, IAmazonS3 s3) : ITextStore, IDisposable { private const string TextFileName = "text.bin"; private const string AutoCompleteFileName = "autocomplete.bin"; @@ -25,18 +25,10 @@ public class S3TextStore : ITextStore, IDisposable private const string CapabilitiesFileName = "capabilities.json"; private const string PageSequenceFileName = "pagesequence.json"; - private readonly IAmazonS3 _s3; - private readonly string _bucket; - private readonly string _prefix; - - public S3TextStore(IOptions options, IAmazonS3 s3) - { - _s3 = s3; - _bucket = options.Value.BucketName; - _prefix = string.IsNullOrEmpty(options.Value.KeyPrefix) - ? string.Empty - : options.Value.KeyPrefix.TrimEnd('/') + '/'; - } + private readonly string _bucket = options.Value.BucketName; + private readonly string _prefix = string.IsNullOrEmpty(options.Value.KeyPrefix) + ? string.Empty + : options.Value.KeyPrefix.TrimEnd('/') + '/'; /// public async Task SaveText(string key, Text text) @@ -154,7 +146,7 @@ public async Task Exists(string key) { try { - await _s3.GetObjectMetadataAsync(_bucket, GetS3Key(key, TextFileName)); + await s3.GetObjectMetadataAsync(_bucket, GetS3Key(key, TextFileName)); return true; } catch (AmazonS3Exception ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) @@ -209,7 +201,7 @@ public async Task DeleteArtefacts(string key) { try { - await _s3.DeleteObjectAsync(_bucket, GetS3Key(key, fileName)); + await s3.DeleteObjectAsync(_bucket, GetS3Key(key, fileName)); } catch (AmazonS3Exception ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) { @@ -218,7 +210,7 @@ public async Task DeleteArtefacts(string key) } } - public void Dispose() => _s3.Dispose(); + public void Dispose() => s3.Dispose(); // ------------------------------------------------------------------------- // Helpers @@ -230,7 +222,7 @@ private string GetS3Key(string key, string fileName) private async Task PutObjectAsync(string s3Key, MemoryStream stream, string contentType) { stream.Position = 0; - await _s3.PutObjectAsync(new PutObjectRequest + await s3.PutObjectAsync(new PutObjectRequest { BucketName = _bucket, Key = s3Key, @@ -243,7 +235,7 @@ await _s3.PutObjectAsync(new PutObjectRequest { try { - var response = await _s3.GetObjectAsync(_bucket, s3Key); + var response = await s3.GetObjectAsync(_bucket, s3Key); return response.ResponseStream; } catch (AmazonS3Exception ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) diff --git a/src/TextServices.Tests/Core/TextNormalisationEdgeCaseTests.cs b/src/TextServices.Tests/Core/TextNormalisationEdgeCaseTests.cs index 09ae8c6..9f96be4 100644 --- a/src/TextServices.Tests/Core/TextNormalisationEdgeCaseTests.cs +++ b/src/TextServices.Tests/Core/TextNormalisationEdgeCaseTests.cs @@ -154,8 +154,8 @@ public void Normalise_WhitespaceVariants_CollapsedToSingleSpace(string input, st // IMPORTANT KNOWN LIMITATION: accented letters are kept as-is (lowercased). // "café" normalises to "caf\u00e9", NOT "cafe". This means a user searching // "cafe" (no accent) will NOT find "café" in the text. This replicates the - // behaviour of both reference implementations exactly (Wellcome has a TODO - // comment noting this limitation). Do not change this behaviour silently. + // behaviour of both reference implementations exactly. + // Do not change this behaviour silently. // ------------------------------------------------------------------------- [Fact] diff --git a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs index 50c1dd7..88bb26f 100644 --- a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs +++ b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs @@ -31,7 +31,7 @@ public class CapabilityGatingTests [Fact] public async Task Search_WhenSearchFlagAbsent_ReturnsNull() { - var handler = new SearchHandler(new StubTextCache(JobServices.Autocomplete)); + var handler = new SearchHandler(new StubTextCache(JobServices.Autocomplete), new NullLogger()); var result = await handler.Handle( new SearchRequest(Id, "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -43,7 +43,7 @@ public async Task Search_WhenSearchFlagAbsent_ReturnsNull() public async Task Search_WhenCapabilitiesNull_ProceedsToTextLookup() { // null capabilities = all enabled; text = null → null result (text not found) - var handler = new SearchHandler(new StubTextCache(null)); + var handler = new SearchHandler(new StubTextCache(null), new NullLogger()); var result = await handler.Handle( new SearchRequest(Id, "hello", SelfUrl, SelfUrl), CancellationToken.None); diff --git a/src/TextServices.Tests/SearchApi/SearchV2HandlerTests.cs b/src/TextServices.Tests/SearchApi/SearchV2HandlerTests.cs index 558c993..d1562fe 100644 --- a/src/TextServices.Tests/SearchApi/SearchV2HandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/SearchV2HandlerTests.cs @@ -1,4 +1,5 @@ using System.Xml.Linq; +using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; using TextServices.Core.Providers; @@ -23,7 +24,7 @@ public class SearchV2HandlerTests [Fact] public async Task Handle_TextNotFound_ReturnsNull() { - var handler = new SearchV2Handler(new StubTextCache(null)); + var handler = GetSearchV2Handler(null); var result = await handler.Handle( new SearchV2Request("missing/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -35,7 +36,7 @@ public async Task Handle_TextNotFound_ReturnsNull() public async Task Handle_EmptyQuery_ReturnsEmptyResponse() { var text = BuildSpatialText([("https://example.org/c/1", 1000, 1500, "hello world")]); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "", SelfUrl, SelfUrl), CancellationToken.None); @@ -53,7 +54,7 @@ public async Task Handle_TemporalHit_MotivationIsSupplementing() { var canvasId = "https://example.org/c/audio"; var text = BuildTemporalText(canvasId, "00:00:05.000 --> 00:00:08.500\nHello world\n"); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -68,7 +69,7 @@ public async Task Handle_TemporalHit_TargetUsesTemporalFragment() { var canvasId = "https://example.org/c/audio"; var text = BuildTemporalText(canvasId, "00:00:05.000 --> 00:00:08.500\nHello world\n"); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -84,7 +85,7 @@ public async Task Handle_TemporalHit_FragmentUsesDecimalPoint() { var canvasId = "https://example.org/c/audio"; var text = BuildTemporalText(canvasId, "00:00:05.500 --> 00:00:08.750\nHello world\n"); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -103,7 +104,7 @@ public async Task Handle_TemporalHit_TimesAreInSeconds() var canvasId = "https://example.org/c/audio"; // StartMs = 5000 → 5 seconds var text = BuildTemporalText(canvasId, "00:00:05.000 --> 00:00:08.500\nHello world\n"); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -122,7 +123,7 @@ public async Task Handle_SpatialHit_MotivationIsPainting() { var canvasId = "https://example.org/c/1"; var text = BuildSpatialText([(canvasId, 1000, 1500, "the quick brown fox")]); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "quick", SelfUrl, SelfUrl), CancellationToken.None); @@ -140,7 +141,7 @@ public async Task Handle_WithQueryInSelfUrl_AnnotationIdDoesNotContainQueryStrin var selfUrlWithQuery = $"{SelfUrl}?q=quick"; var canvasId = "https://example.org/c/1"; var text = BuildSpatialText([(canvasId, 1000, 1500, "the quick brown fox")]); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "quick", selfUrlWithQuery, SelfUrl), CancellationToken.None); @@ -158,7 +159,7 @@ public async Task Handle_WithQueryInSelfUrl_ContextualizingAnnotationIdDoesNotCo var selfUrlWithQuery = $"{SelfUrl}?q=quick"; var canvasId = "https://example.org/c/1"; var text = BuildSpatialText([(canvasId, 1000, 1500, "the quick brown fox")]); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "quick", selfUrlWithQuery, SelfUrl), CancellationToken.None); @@ -179,7 +180,7 @@ public async Task Handle_TemporalHit_ContextualizingAnnotationPresent() { var canvasId = "https://example.org/c/audio"; var text = BuildTemporalText(canvasId, "00:00:05.000 --> 00:00:08.500\nHello world\n"); - var handler = new SearchV2Handler(new StubTextCache(text)); + var handler = GetSearchV2Handler(text); var result = await handler.Handle( new SearchV2Request("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -193,6 +194,10 @@ public async Task Handle_TemporalHit_ContextualizingAnnotationPresent() // ------------------------------------------------------------------------- // Helpers // ------------------------------------------------------------------------- + private static SearchV2Handler GetSearchV2Handler(Text? text) + { + return new SearchV2Handler(new StubTextCache(text), new NullLogger()); + } /// /// Builds a temporal from a canvas id and raw VTT cue block From fbd1b7edbfc13ad8900fa227188bd247ea7672c7 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 11:57:13 +0100 Subject: [PATCH 04/21] Add structured logging to storage and text format providers Previously ~90% of files had no logging despite Serilog being fully wired up. Storage I/O failures were silent, and parse events in Core providers (hyphenation merges, missing PrintSpace, timestamp errors) produced no observable output. * Add ILogger to FileSystemTextStore and S3TextStore; log every Save/Load/Delete/Exists at Debug, and unexpected S3 errors at Warning * Add ILoggerFactory? to TextBuilder (default NullLoggerFactory) and thread loggers through to all four format providers * AltoTextFormatProvider: logs namespace, missing PrintSpace (Warning), hyphenation merges (Debug), unmatched HypPart1 (Warning) * HocrTextFormatProvider: logs missing ocr_page, unparseable bbox * VttTextFormatProvider: logs timing-line parse failures (Warning) * W3cAnnotationTextFormatProvider: logs skipped/unparseable annotations * TextBuilder logs Warning when no provider matches a page * Use ActivatorUtilities.CreateInstance for FileSystemTextStore production registrations, consistent with S3TextStore, so future DI-resolvable constructor params are auto-resolved Co-Authored-By: Claude Sonnet 4.6 --- .../ServiceCollectionExtensions.cs | 4 +- .../Jobs/TextBuildJob.cs | 3 +- .../Providers/AltoTextFormatProvider.cs | 16 ++- .../Providers/HocrTextFormatProvider.cs | 22 ++++- .../Providers/TextBuilder.cs | 33 +++++-- .../Providers/VttTextFormatProvider.cs | 48 +++++---- .../W3cAnnotationTextFormatProvider.cs | 21 +++- .../TextServices.Core.csproj | 1 + src/TextServices.Search.Api/Program.cs | 10 +- .../FileSystemTextStore.cs | 99 +++++++++++++++---- src/TextServices.Storage/S3TextStore.cs | 50 +++++++--- .../Infrastructure/BuilderApiFactory.cs | 5 +- .../Infrastructure/SearchApiFactory.cs | 5 +- .../BuilderApi/TextBuildJobTests.cs | 4 +- .../SearchApi/SearchHandlerTests.cs | 19 ++-- .../Storage/FileSystemTextStoreTests.cs | 3 +- .../Storage/S3TextStoreTests.cs | 4 +- 17 files changed, 259 insertions(+), 88 deletions(-) diff --git a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs index c936206..347ac02 100644 --- a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs +++ b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs @@ -81,7 +81,9 @@ public static IServiceCollection AddTextStorage(this IServiceCollection services if (!string.IsNullOrEmpty(configuration["TextServices:Storage:S3:BucketName"])) return ActivatorUtilities.CreateInstance(sp); var storage = sp.GetRequiredService>().Value.Storage; - return new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = storage.RootPath }); + return ActivatorUtilities.CreateInstance( + sp, + new FileSystemTextStoreOptions { RootPath = storage.RootPath }); }); return services; diff --git a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs index fb2a759..9b305e8 100644 --- a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs +++ b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs @@ -35,6 +35,7 @@ public class TextBuildJob( ITextStore textStore, IJobNotifier jobNotifier, IOptions options, + ILoggerFactory loggerFactory, ILogger logger) { private const int ProgressBatchSize = 10; @@ -170,7 +171,7 @@ await Parallel.ForEachAsync(Enumerable.Range(0, pagesToFetch.Count), async (i, ct) => { fetched[i] = await FetchPageAsync(pagesToFetch[i], ct); }); // Build text in original canvas order (TextBuilder requires sequential input). - var textBuilder = new TextBuilder(); + var textBuilder = new TextBuilder(loggerFactory); var errors = fetched.Where(r => r.Error != null).Select(r => r.Error!).ToList(); int completed = 0; diff --git a/src/TextServices.Core/Providers/AltoTextFormatProvider.cs b/src/TextServices.Core/Providers/AltoTextFormatProvider.cs index 37c907b..0eae22d 100644 --- a/src/TextServices.Core/Providers/AltoTextFormatProvider.cs +++ b/src/TextServices.Core/Providers/AltoTextFormatProvider.cs @@ -1,5 +1,7 @@ using System.Runtime.CompilerServices; using System.Xml.Linq; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Core.Models; namespace TextServices.Core.Providers; @@ -9,8 +11,10 @@ namespace TextServices.Core.Providers; /// Supports both ns-v2 (http://www.loc.gov/standards/alto/ns-v2#) and ns-v3 /// (http://www.loc.gov/standards/alto/ns-v3#) as well as namespace-free ALTO. /// -public class AltoTextFormatProvider : ITextFormatProvider +public class AltoTextFormatProvider(ILogger? logger = null) : ITextFormatProvider { + private readonly ILogger _logger = logger ?? NullLogger.Instance; + // Hyphen character used in some ALTO sources as a soft-hyphen marker private const char HyphenSpecial = '¬'; @@ -40,6 +44,8 @@ public void ProcessPage( int canvasHeight) { var ns = DetectAltoNamespace(root); + _logger.LogDebug("Processing ALTO page '{Id}' with {Namespace} namespace", + imageIdentifier, ns == XNamespace.None ? "none" : ns.NamespaceName); var pageElement = FindDescendant(root, ns, "Page"); if (pageElement == null) return; @@ -51,7 +57,11 @@ public void ProcessPage( float scaleH = altoHeight > 0 ? canvasHeight / altoHeight : 1f; var printSpace = FindDescendant(pageElement, ns, "PrintSpace"); - if (printSpace == null) return; + if (printSpace == null) + { + _logger.LogWarning("No PrintSpace found for ALTO page '{Id}'; page will be empty", imageIdentifier); + return; + } accumulator.BeginPage(imageIdentifier); @@ -94,6 +104,7 @@ public void ProcessPage( accumulator.LastWordNormPosition, composedBlockTracker); } + _logger.LogDebug("Merged hyphenated word on page '{Id}'", imageIdentifier); hyphenPending = false; startIdx = 1; // HypPart2 string consumed — skip it below } @@ -143,6 +154,7 @@ public void ProcessPage( // If the document ends with an unmatched HypPart1 (edge case), commit it as-is. if (hyphenPending && !string.IsNullOrEmpty(pendingNorm)) { + _logger.LogWarning("Unmatched HypPart1 at end of ALTO page '{Id}'", imageIdentifier); accumulator.AddWord(pendingRaw, pendingNorm, pendingX, pendingY, pendingW, pendingH, pendingSp); } diff --git a/src/TextServices.Core/Providers/HocrTextFormatProvider.cs b/src/TextServices.Core/Providers/HocrTextFormatProvider.cs index 6ead4ca..8776eb5 100644 --- a/src/TextServices.Core/Providers/HocrTextFormatProvider.cs +++ b/src/TextServices.Core/Providers/HocrTextFormatProvider.cs @@ -1,4 +1,6 @@ using System.Xml.Linq; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Core.Models; namespace TextServices.Core.Providers; @@ -14,8 +16,10 @@ namespace TextServices.Core.Providers; /// engine producing ocr_word spans. Well-formed XHTML input is required; /// is used by the fetcher before this provider receives the root. /// -public class HocrTextFormatProvider : ITextFormatProvider +public class HocrTextFormatProvider(ILogger? logger = null) : ITextFormatProvider { + private readonly ILogger _logger = logger ?? NullLogger.Instance; + /// /// /// Detects hOCR by looking for "hocr" in the seeAlso URI @@ -39,7 +43,12 @@ public void ProcessPage( int canvasHeight) { // Locate the ocr_page element — may be the root itself or a descendant. - var pageEl = FindByClass(root, "ocr_page").FirstOrDefault() ?? root; + var foundPage = FindByClass(root, "ocr_page").FirstOrDefault(); + if (foundPage == null) + { + _logger.LogDebug("No ocr_page element found for page '{Id}'; using root element", imageIdentifier); + } + var pageEl = foundPage ?? root; // Derive the coordinate scale from the page-level bbox when present. // hOCR bbox coords are x0 y0 x1 y1; for a page starting at 0,0 x1=pageW, y1=pageH. @@ -66,8 +75,13 @@ public void ProcessPage( foreach (var wordEl in wordEls) { - var bbox = ParseBbox(wordEl.Attribute("title")?.Value); - if (bbox is null) continue; + var wordTitle = wordEl.Attribute("title")?.Value; + var bbox = ParseBbox(wordTitle); + if (bbox is null) + { + if (wordTitle != null) _logger.LogDebug("Could not parse bbox for word on page '{Id}'; skipping", imageIdentifier); + continue; + } // Collect all text under this element, ignoring child markup. var rawWord = string.Concat( diff --git a/src/TextServices.Core/Providers/TextBuilder.cs b/src/TextServices.Core/Providers/TextBuilder.cs index 329a6ea..fa9cd15 100644 --- a/src/TextServices.Core/Providers/TextBuilder.cs +++ b/src/TextServices.Core/Providers/TextBuilder.cs @@ -1,4 +1,6 @@ using System.Xml.Linq; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Core.Models; namespace TextServices.Core.Providers; @@ -23,15 +25,25 @@ public class TextBuilder private readonly IReadOnlyList _providers; private readonly IReadOnlyList _transcriptProviders; private readonly TextAccumulator _accumulator = new(); + private readonly ILogger _logger; /// /// Initialises a with the default set of providers /// (ALTO ns-v2 / ns-v3, hOCR, and VTT). /// - public TextBuilder() : this( - [new AltoTextFormatProvider(), new HocrTextFormatProvider()], - [new VttTextFormatProvider(), new W3cAnnotationTextFormatProvider()]) - { } + public TextBuilder(ILoggerFactory? loggerFactory = null) + { + loggerFactory ??= NullLoggerFactory.Instance; + _logger = loggerFactory.CreateLogger(); + _providers = [ + new AltoTextFormatProvider(loggerFactory.CreateLogger()), + new HocrTextFormatProvider(loggerFactory.CreateLogger()), + ]; + _transcriptProviders = [ + new VttTextFormatProvider(loggerFactory.CreateLogger()), + new W3cAnnotationTextFormatProvider(loggerFactory.CreateLogger()), + ]; + } /// /// Initialises a with an explicit list of providers, @@ -40,6 +52,7 @@ public TextBuilder() : this( /// public TextBuilder(IReadOnlyList providers, IReadOnlyList? transcriptProviders = null) { + _logger = NullLogger.Instance; _providers = providers; _transcriptProviders = transcriptProviders ?? []; } @@ -66,7 +79,11 @@ public void AddPage( if (root == null) return; var provider = _providers.FirstOrDefault(p => p.Supports(profile, label)); - if (provider == null) return; + if (provider == null) + { + _logger.LogWarning("No provider found for page '{Id}' (profile='{Profile}', label='{Label}')", id, profile, label); + return; + } provider.ProcessPage(_accumulator, root, id, canvasWidth, canvasHeight); } @@ -85,7 +102,11 @@ public void AddTranscriptPage( { if (rawContent == null) return; var provider = _transcriptProviders.FirstOrDefault(p => p.Supports(profile, format, label)); - if (provider == null) return; + if (provider == null) + { + _logger.LogWarning("No provider found for transcript page '{Id}' (profile='{Profile}', format='{Format}', label='{Label}')", id, profile, format, label); + return; + } provider.ProcessPage(_accumulator, rawContent, id, canvasWidth, canvasHeight); } diff --git a/src/TextServices.Core/Providers/VttTextFormatProvider.cs b/src/TextServices.Core/Providers/VttTextFormatProvider.cs index 28462e5..a86b505 100644 --- a/src/TextServices.Core/Providers/VttTextFormatProvider.cs +++ b/src/TextServices.Core/Providers/VttTextFormatProvider.cs @@ -1,3 +1,5 @@ +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Core.Models; namespace TextServices.Core.Providers; @@ -6,8 +8,10 @@ namespace TextServices.Core.Providers; /// Processes WebVTT caption/subtitle files into the text index. /// Each VTT cue becomes a set of words sharing the same time range and line number. /// -public class VttTextFormatProvider : IStringFormatProvider +public class VttTextFormatProvider(ILogger? logger = null) : IStringFormatProvider { + private readonly ILogger _logger = logger ?? NullLogger.Instance; + public bool Supports(string? profile, string? format, string? label) => ContainsIgnoreCase(profile, "text/vtt") || ContainsIgnoreCase(format, "text/vtt") || ContainsIgnoreCase(profile, "vtt") || ContainsIgnoreCase(format, "vtt") || @@ -43,7 +47,15 @@ public void ProcessPage( // Timing line var timingLine = lines[i++]; - var (startMs, endMs) = ParseTimingLine(timingLine); + int startMs = 0, endMs = 0; + try + { + (startMs, endMs) = ParseTimingLine(timingLine); + } + catch (Exception) + { + _logger.LogWarning("Failed to parse VTT timing line on page '{Id}': {Value}", imageIdentifier, timingLine); + } // Collect cue text lines until blank line var cueLines = new List(); @@ -83,25 +95,21 @@ private static (int startMs, int endMs) ParseTimingLine(string line) private static int ParseTimestamp(string s) { // Accepts HH:MM:SS.mmm or MM:SS.mmm - try + var dotIdx = s.IndexOf('.'); + var ms = dotIdx >= 0 ? int.Parse(s[(dotIdx + 1)..].PadRight(3, '0')[..3]) : 0; + var timePart = dotIdx >= 0 ? s[..dotIdx] : s; + var colonParts = timePart.Split(':'); + return colonParts.Length switch { - var dotIdx = s.IndexOf('.'); - var ms = dotIdx >= 0 ? int.Parse(s[(dotIdx + 1)..].PadRight(3, '0')[..3]) : 0; - var timePart = dotIdx >= 0 ? s[..dotIdx] : s; - var colonParts = timePart.Split(':'); - return colonParts.Length switch - { - 3 => int.Parse(colonParts[0]) * 3_600_000 - + int.Parse(colonParts[1]) * 60_000 - + int.Parse(colonParts[2]) * 1_000 - + ms, - 2 => int.Parse(colonParts[0]) * 60_000 - + int.Parse(colonParts[1]) * 1_000 - + ms, - _ => 0, - }; - } - catch { return 0; } + 3 => int.Parse(colonParts[0]) * 3_600_000 + + int.Parse(colonParts[1]) * 60_000 + + int.Parse(colonParts[2]) * 1_000 + + ms, + 2 => int.Parse(colonParts[0]) * 60_000 + + int.Parse(colonParts[1]) * 1_000 + + ms, + _ => 0, + }; } /// Strips VTT inline markup tags (speaker tags, timestamp tags, class tags). diff --git a/src/TextServices.Core/Providers/W3cAnnotationTextFormatProvider.cs b/src/TextServices.Core/Providers/W3cAnnotationTextFormatProvider.cs index 613a3e9..6bf5a37 100644 --- a/src/TextServices.Core/Providers/W3cAnnotationTextFormatProvider.cs +++ b/src/TextServices.Core/Providers/W3cAnnotationTextFormatProvider.cs @@ -1,5 +1,7 @@ using System.Globalization; using System.Text.Json; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Core.Models; namespace TextServices.Core.Providers; @@ -15,8 +17,11 @@ namespace TextServices.Core.Providers; /// All words within one annotation share the same bounding box (or time range) /// and the same line number, so they coalesce correctly in search results. /// -public class W3cAnnotationTextFormatProvider : IStringFormatProvider +public class W3cAnnotationTextFormatProvider(ILogger? logger = null) : IStringFormatProvider { + private readonly ILogger _logger = + logger ?? NullLogger.Instance; + /// /// Internal sentinel value written into PageInstruction.Format by /// ManifestReducer when an externally-referenced AnnotationPage is @@ -45,7 +50,11 @@ public void ProcessPage( foreach (var anno in items.EnumerateArray()) { - if (!HasSupplementingMotivation(anno)) continue; + if (!HasSupplementingMotivation(anno)) + { + _logger.LogTrace("Skipped annotation without 'supplementing' motivation on page '{Id}'", imageIdentifier); + continue; + } if (!anno.TryGetProperty("body", out var bodyEl)) continue; var body = bodyEl.ValueKind == JsonValueKind.Array @@ -65,7 +74,11 @@ public void ProcessPage( if (!anno.TryGetProperty("target", out var targetEl)) continue; var target = ParseTarget(targetEl); - if (target == null) continue; + if (target == null) + { + _logger.LogDebug("Could not parse target for annotation on page '{Id}'; skipping", imageIdentifier); + continue; + } entries.Add((text, target.Value)); } @@ -180,7 +193,7 @@ public void ProcessPage( if (coords.StartsWith("pixel:", StringComparison.OrdinalIgnoreCase)) coords = coords[6..]; else if (coords.StartsWith("percent:", StringComparison.OrdinalIgnoreCase)) - return null; // percentage coordinates not supported + return null; // percentage coordinates not supported — logged by caller var parts = coords.Split(','); if (parts.Length != 4) return null; diff --git a/src/TextServices.Core/TextServices.Core.csproj b/src/TextServices.Core/TextServices.Core.csproj index b35d7ab..19b801a 100644 --- a/src/TextServices.Core/TextServices.Core.csproj +++ b/src/TextServices.Core/TextServices.Core.csproj @@ -7,6 +7,7 @@ + diff --git a/src/TextServices.Search.Api/Program.cs b/src/TextServices.Search.Api/Program.cs index 5546457..ab34191 100644 --- a/src/TextServices.Search.Api/Program.cs +++ b/src/TextServices.Search.Api/Program.cs @@ -61,10 +61,12 @@ // ---- Storage ---------------------------------------------------------------- builder.Services.AddSingleton(sp => - new FileSystemTextStore(new FileSystemTextStoreOptions - { - RootPath = sp.GetRequiredService>().Value.StorageRootPath - })); + ActivatorUtilities.CreateInstance( + sp, + new FileSystemTextStoreOptions + { + RootPath = sp.GetRequiredService>().Value.StorageRootPath + })); // ITextStore is also injected directly into TextAugmentedHandler (manifest is plain JSON, // not routed through the Text/AutoComplete cache). diff --git a/src/TextServices.Storage/FileSystemTextStore.cs b/src/TextServices.Storage/FileSystemTextStore.cs index 46c55a7..6052709 100644 --- a/src/TextServices.Storage/FileSystemTextStore.cs +++ b/src/TextServices.Storage/FileSystemTextStore.cs @@ -1,3 +1,4 @@ +using Microsoft.Extensions.Logging; using ProtoBuf; using TextServices.Core.Models; @@ -11,7 +12,7 @@ namespace TextServices.Storage; /// Job key segments (split on /) become nested subdirectories, so the key /// "2/books/my-book" is stored under {RootPath}/2/books/my-book/. /// -public class FileSystemTextStore : ITextStore +public class FileSystemTextStore(FileSystemTextStoreOptions options, ILogger logger) : ITextStore { private const string TextFileName = "text.bin"; private const string AutoCompleteFileName = "autocomplete.bin"; @@ -23,12 +24,7 @@ public class FileSystemTextStore : ITextStore private const string CapabilitiesFileName = "capabilities.json"; private const string PageSequenceFileName = "pagesequence.json"; - private readonly string _rootPath; - - public FileSystemTextStore(FileSystemTextStoreOptions options) - { - _rootPath = options.RootPath; - } + private readonly string _rootPath = options.RootPath; /// public async Task SaveText(string key, Text text) @@ -37,16 +33,22 @@ public async Task SaveText(string key, Text text) EnsureDirectory(path); await using var stream = File.Create(path); Serializer.Serialize(stream, text); + logger.LogDebug("Saved {Type} for '{Key}'", TextFileName, key); } /// public Task LoadText(string key) { var path = GetPath(key, TextFileName); - if (!File.Exists(path)) return Task.FromResult(null); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", TextFileName, key); + return Task.FromResult(null); + } using var stream = File.OpenRead(path); var text = Serializer.Deserialize(stream); + logger.LogDebug("Loaded {Type} for '{Key}'", TextFileName, key); return Task.FromResult(text); } @@ -57,16 +59,22 @@ public async Task SaveAutoComplete(string key, AutoComplete autoComplete) EnsureDirectory(path); await using var stream = File.Create(path); Serializer.Serialize(stream, autoComplete); + logger.LogDebug("Saved {Type} for '{Key}'", AutoCompleteFileName, key); } /// public Task LoadAutoComplete(string key) { var path = GetPath(key, AutoCompleteFileName); - if (!File.Exists(path)) return Task.FromResult(null); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", AutoCompleteFileName, key); + return Task.FromResult(null); + } using var stream = File.OpenRead(path); var ac = Serializer.Deserialize(stream); + logger.LogDebug("Loaded {Type} for '{Key}'", AutoCompleteFileName, key); return Task.FromResult(ac); } @@ -76,14 +84,21 @@ public async Task SaveManifest(string key, string json) var path = GetPath(key, ManifestFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, json); + logger.LogDebug("Saved {Type} for '{Key}'", ManifestFileName, key); } /// public async Task LoadManifest(string key) { var path = GetPath(key, ManifestFileName); - if (!File.Exists(path)) return null; - return await File.ReadAllTextAsync(path); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", ManifestFileName, key); + return null; + } + var result = await File.ReadAllTextAsync(path); + logger.LogDebug("Loaded {Type} for '{Key}'", ManifestFileName, key); + return result; } /// @@ -92,14 +107,21 @@ public async Task SaveRawText(string key, string rawText) var path = GetPath(key, RawTextFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, rawText); + logger.LogDebug("Saved {Type} for '{Key}'", RawTextFileName, key); } /// public async Task LoadRawText(string key) { var path = GetPath(key, RawTextFileName); - if (!File.Exists(path)) return null; - return await File.ReadAllTextAsync(path); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", RawTextFileName, key); + return null; + } + var result = await File.ReadAllTextAsync(path); + logger.LogDebug("Loaded {Type} for '{Key}'", RawTextFileName, key); + return result; } /// @@ -109,13 +131,19 @@ public async Task SavePdf(string key, Stream pdfStream) EnsureDirectory(path); await using var file = File.Create(path); await pdfStream.CopyToAsync(file); + logger.LogDebug("Saved {Type} for '{Key}'", PdfFileName, key); } /// public Task LoadPdf(string key) { var path = GetPath(key, PdfFileName); - if (!File.Exists(path)) return Task.FromResult(null); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", PdfFileName, key); + return Task.FromResult(null); + } + logger.LogDebug("Loaded {Type} for '{Key}'", PdfFileName, key); return Task.FromResult(File.OpenRead(path)); } @@ -125,14 +153,21 @@ public async Task SaveFigures(string key, string json) var path = GetPath(key, FiguresFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, json); + logger.LogDebug("Saved {Type} for '{Key}'", FiguresFileName, key); } /// public async Task LoadFigures(string key) { var path = GetPath(key, FiguresFileName); - if (!File.Exists(path)) return null; - return await File.ReadAllTextAsync(path); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", FiguresFileName, key); + return null; + } + var result = await File.ReadAllTextAsync(path); + logger.LogDebug("Loaded {Type} for '{Key}'", FiguresFileName, key); + return result; } /// @@ -141,21 +176,30 @@ public async Task SaveAnnotations(string key, string json) var path = GetPath(key, AnnotationsFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, json); + logger.LogDebug("Saved {Type} for '{Key}'", AnnotationsFileName, key); } /// public async Task LoadAnnotations(string key) { var path = GetPath(key, AnnotationsFileName); - if (!File.Exists(path)) return null; - return await File.ReadAllTextAsync(path); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", AnnotationsFileName, key); + return null; + } + var result = await File.ReadAllTextAsync(path); + logger.LogDebug("Loaded {Type} for '{Key}'", AnnotationsFileName, key); + return result; } /// public Task Exists(string key) { var path = GetPath(key, TextFileName); - return Task.FromResult(File.Exists(path)); + var exists = File.Exists(path); + logger.LogDebug("Artefact {Exists} for '{Key}'", exists, key); + return Task.FromResult(exists); } /// @@ -164,14 +208,20 @@ public async Task SaveCapabilities(string key, int services) var path = GetPath(key, CapabilitiesFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, services.ToString()); + logger.LogDebug("Saved {Type} for '{Key}'", CapabilitiesFileName, key); } /// public async Task LoadCapabilities(string key) { var path = GetPath(key, CapabilitiesFileName); - if (!File.Exists(path)) return null; + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", CapabilitiesFileName, key); + return null; + } var text = await File.ReadAllTextAsync(path); + logger.LogDebug("Loaded {Type} for '{Key}'", CapabilitiesFileName, key); return int.TryParse(text.Trim(), out var value) ? value : null; } @@ -181,13 +231,19 @@ public async Task SavePageSequence(string key, string json) var path = GetPath(key, PageSequenceFileName); EnsureDirectory(path); await File.WriteAllTextAsync(path, json); + logger.LogDebug("Saved {Type} for '{Key}'", PageSequenceFileName, key); } /// public Task LoadPageSequence(string key) { var path = GetPath(key, PageSequenceFileName); - if (!File.Exists(path)) return Task.FromResult(null); + if (!File.Exists(path)) + { + logger.LogDebug("{Type} not found for '{Key}'", PageSequenceFileName, key); + return Task.FromResult(null); + } + logger.LogDebug("Loaded {Type} for '{Key}'", PageSequenceFileName, key); return File.ReadAllTextAsync(path)!; } @@ -205,6 +261,7 @@ public Task DeleteArtefacts(string key) var path = GetPath(key, fileName); if (File.Exists(path)) File.Delete(path); } + logger.LogDebug("Deleted artefacts for '{Key}'", key); return Task.CompletedTask; } diff --git a/src/TextServices.Storage/S3TextStore.cs b/src/TextServices.Storage/S3TextStore.cs index 7c81dec..0b0a73d 100644 --- a/src/TextServices.Storage/S3TextStore.cs +++ b/src/TextServices.Storage/S3TextStore.cs @@ -1,5 +1,6 @@ using Amazon.S3; using Amazon.S3.Model; +using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using ProtoBuf; using TextServices.Core.Models; @@ -13,7 +14,7 @@ namespace TextServices.Storage; /// Job key segments (split on /) become S3 key path components, so the key /// "2/books/my-book" is stored under {KeyPrefix}2/books/my-book/. /// -public class S3TextStore(IOptions options, IAmazonS3 s3) : ITextStore, IDisposable +public class S3TextStore(IOptions options, IAmazonS3 s3, ILogger logger) : ITextStore, IDisposable { private const string TextFileName = "text.bin"; private const string AutoCompleteFileName = "autocomplete.bin"; @@ -36,13 +37,15 @@ public async Task SaveText(string key, Text text) using var stream = new MemoryStream(); Serializer.Serialize(stream, text); await PutObjectAsync(GetS3Key(key, TextFileName), stream, "application/octet-stream"); + logger.LogDebug("Saved {Type} for '{Key}'", TextFileName, key); } /// public async Task LoadText(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, TextFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, TextFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", TextFileName, key); using (stream) return Serializer.Deserialize(stream); } @@ -52,13 +55,15 @@ public async Task SaveAutoComplete(string key, AutoComplete autoComplete) using var stream = new MemoryStream(); Serializer.Serialize(stream, autoComplete); await PutObjectAsync(GetS3Key(key, AutoCompleteFileName), stream, "application/octet-stream"); + logger.LogDebug("Saved {Type} for '{Key}'", AutoCompleteFileName, key); } /// public async Task LoadAutoComplete(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, AutoCompleteFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, AutoCompleteFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", AutoCompleteFileName, key); using (stream) return Serializer.Deserialize(stream); } @@ -67,13 +72,15 @@ public async Task SaveManifest(string key, string json) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); await PutObjectAsync(GetS3Key(key, ManifestFileName), stream, "application/json"); + logger.LogDebug("Saved {Type} for '{Key}'", ManifestFileName, key); } /// public async Task LoadManifest(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, ManifestFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, ManifestFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", ManifestFileName, key); using var reader = new StreamReader(stream); return await reader.ReadToEndAsync(); } @@ -83,13 +90,15 @@ public async Task SaveRawText(string key, string rawText) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(rawText)); await PutObjectAsync(GetS3Key(key, RawTextFileName), stream, "text/plain"); + logger.LogDebug("Saved {Type} for '{Key}'", RawTextFileName, key); } /// public async Task LoadRawText(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, RawTextFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, RawTextFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", RawTextFileName, key); using var reader = new StreamReader(stream); return await reader.ReadToEndAsync(); } @@ -100,12 +109,14 @@ public async Task SavePdf(string key, Stream pdfStream) using var ms = new MemoryStream(); await pdfStream.CopyToAsync(ms); await PutObjectAsync(GetS3Key(key, PdfFileName), ms, "application/pdf"); + logger.LogDebug("Saved {Type} for '{Key}'", PdfFileName, key); } /// public async Task LoadPdf(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, PdfFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, PdfFileName), key); + if (stream != null) logger.LogDebug("Loaded {Type} for '{Key}'", PdfFileName, key); return stream; } @@ -114,13 +125,15 @@ public async Task SaveFigures(string key, string json) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); await PutObjectAsync(GetS3Key(key, FiguresFileName), stream, "application/json"); + logger.LogDebug("Saved {Type} for '{Key}'", FiguresFileName, key); } /// public async Task LoadFigures(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, FiguresFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, FiguresFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", FiguresFileName, key); using var reader = new StreamReader(stream); return await reader.ReadToEndAsync(); } @@ -130,13 +143,15 @@ public async Task SaveAnnotations(string key, string json) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); await PutObjectAsync(GetS3Key(key, AnnotationsFileName), stream, "application/json"); + logger.LogDebug("Saved {Type} for '{Key}'", AnnotationsFileName, key); } /// public async Task LoadAnnotations(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, AnnotationsFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, AnnotationsFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", AnnotationsFileName, key); using var reader = new StreamReader(stream); return await reader.ReadToEndAsync(); } @@ -147,10 +162,12 @@ public async Task Exists(string key) try { await s3.GetObjectMetadataAsync(_bucket, GetS3Key(key, TextFileName)); + logger.LogDebug("Artefact exists for '{Key}'", key); return true; } catch (AmazonS3Exception ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) { + logger.LogDebug("Artefact not found for '{Key}'", key); return false; } } @@ -160,13 +177,15 @@ public async Task SaveCapabilities(string key, int services) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(services.ToString())); await PutObjectAsync(GetS3Key(key, CapabilitiesFileName), stream, "text/plain"); + logger.LogDebug("Saved {Type} for '{Key}'", CapabilitiesFileName, key); } /// public async Task LoadCapabilities(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, CapabilitiesFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, CapabilitiesFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", CapabilitiesFileName, key); using var reader = new StreamReader(stream); var text = await reader.ReadToEndAsync(); return int.TryParse(text.Trim(), out var value) ? value : null; @@ -177,13 +196,15 @@ public async Task SavePageSequence(string key, string json) { using var stream = new MemoryStream(System.Text.Encoding.UTF8.GetBytes(json)); await PutObjectAsync(GetS3Key(key, PageSequenceFileName), stream, "application/json"); + logger.LogDebug("Saved {Type} for '{Key}'", PageSequenceFileName, key); } /// public async Task LoadPageSequence(string key) { - var stream = await GetObjectStreamAsync(GetS3Key(key, PageSequenceFileName)); + var stream = await GetObjectStreamAsync(GetS3Key(key, PageSequenceFileName), key); if (stream == null) return null; + logger.LogDebug("Loaded {Type} for '{Key}'", PageSequenceFileName, key); using var reader = new StreamReader(stream); return await reader.ReadToEndAsync(); } @@ -208,6 +229,7 @@ public async Task DeleteArtefacts(string key) // Idempotent — object not found is not an error } } + logger.LogDebug("Deleted artefacts for '{Key}'", key); } public void Dispose() => s3.Dispose(); @@ -231,7 +253,7 @@ await s3.PutObjectAsync(new PutObjectRequest }); } - private async Task GetObjectStreamAsync(string s3Key) + private async Task GetObjectStreamAsync(string s3Key, string key) { try { @@ -240,7 +262,13 @@ await s3.PutObjectAsync(new PutObjectRequest } catch (AmazonS3Exception ex) when (ex.StatusCode == System.Net.HttpStatusCode.NotFound) { + logger.LogDebug("{S3Key} not found for '{Key}'", s3Key, key); return null; } + catch (AmazonS3Exception ex) + { + logger.LogWarning(ex, "Unexpected S3 error loading '{S3Key}'", s3Key); + throw; + } } } diff --git a/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs b/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs index 43f6473..524e4b9 100644 --- a/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs +++ b/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs @@ -4,6 +4,7 @@ using Microsoft.AspNetCore.TestHost; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Builder.Api.Data; using TextServices.Builder.Api.Services; using TextServices.Storage; @@ -48,7 +49,9 @@ protected override void ConfigureWebHost(IWebHostBuilder builder) // ---- Storage: use shared temp directory ----------------------------- services.AddSingleton(_ => - new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = StorageRoot })); + new FileSystemTextStore( + new FileSystemTextStoreOptions { RootPath = StorageRoot }, + NullLogger.Instance)); // ---- Fetchers: replace with fixture-based stubs --------------------- services.AddScoped(_ => diff --git a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs index 52c2ad3..0e51021 100644 --- a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs +++ b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs @@ -3,6 +3,7 @@ using Microsoft.AspNetCore.TestHost; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; using TextServices.Search.Api.Services; using TextServices.Storage; @@ -29,6 +30,8 @@ protected override void ConfigureWebHost(IWebHostBuilder builder) builder.ConfigureTestServices(services => services.AddSingleton(_ => - new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = storageRoot }))); + new FileSystemTextStore( + new FileSystemTextStoreOptions { RootPath = storageRoot }, + NullLogger.Instance))); } } diff --git a/src/TextServices.Tests/BuilderApi/TextBuildJobTests.cs b/src/TextServices.Tests/BuilderApi/TextBuildJobTests.cs index 82cd857..6463f40 100644 --- a/src/TextServices.Tests/BuilderApi/TextBuildJobTests.cs +++ b/src/TextServices.Tests/BuilderApi/TextBuildJobTests.cs @@ -36,7 +36,8 @@ public TextBuildJobTests() _db = new BuilderDbContext(options); _textStore = new FileSystemTextStore( - new FileSystemTextStoreOptions { RootPath = _tempDir }); + new FileSystemTextStoreOptions { RootPath = _tempDir }, + NullLogger.Instance); } public void Dispose() @@ -653,6 +654,7 @@ private TextBuildJob MakeJob( _textStore, jobNotifier ?? new NoOpJobNotifier(), Options.Create(new TextServicesOptions()), + NullLoggerFactory.Instance, NullLogger.Instance); } diff --git a/src/TextServices.Tests/SearchApi/SearchHandlerTests.cs b/src/TextServices.Tests/SearchApi/SearchHandlerTests.cs index 589b0ec..d616100 100644 --- a/src/TextServices.Tests/SearchApi/SearchHandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/SearchHandlerTests.cs @@ -1,4 +1,5 @@ using System.Xml.Linq; +using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; using TextServices.Core.Providers; @@ -24,7 +25,7 @@ public class SearchHandlerTests public async Task Handle_EmptyQuery_ReturnsEmptyResponse() { var text = BuildText([("https://example.org/c/1", 1000, 1500, "hello world")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "", SelfUrl, SelfUrl), CancellationToken.None); @@ -39,7 +40,7 @@ public async Task Handle_EmptyQuery_ReturnsEmptyResponse() public async Task Handle_QueryNotFound_ReturnsEmptyResponse() { var text = BuildText([("https://example.org/c/1", 1000, 1500, "hello world")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "parliament", SelfUrl, SelfUrl), CancellationToken.None); @@ -52,7 +53,7 @@ public async Task Handle_QueryNotFound_ReturnsEmptyResponse() [Fact] public async Task Handle_TextNotFound_ReturnsNull() { - var handler = new SearchHandler(new StubTextCache(null)); + var handler = new SearchHandler(new StubTextCache(null), new NullLogger()); var result = await handler.Handle( new SearchRequest("missing/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -69,7 +70,7 @@ public async Task Handle_SingleHit_CorrectAnnotationShape() { var canvasId = "https://example.org/c/1"; var text = BuildText([(canvasId, 1000, 1500, "the quick brown fox")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "quick", SelfUrl, SelfUrl), CancellationToken.None); @@ -91,7 +92,7 @@ public async Task Handle_SingleHit_CorrectAnnotationShape() public async Task Handle_SingleHit_CorrectHitShape() { var text = BuildText([("https://example.org/c/1", 1000, 1500, "the quick brown fox")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "quick", SelfUrl, SelfUrl), CancellationToken.None); @@ -112,7 +113,7 @@ public async Task Handle_SingleHit_CorrectHitShape() public async Task Handle_ResponseHasCorrectId() { var text = BuildText([("https://example.org/c/1", 1000, 1500, "hello world")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "hello", SelfUrl, SelfUrl), CancellationToken.None); @@ -127,7 +128,7 @@ public async Task Handle_MultipleHits_CorrectCount() { // "the" appears twice in this text var text = BuildText([("https://example.org/c/1", 1000, 1500, "the quick brown the fox")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "the", SelfUrl, SelfUrl), CancellationToken.None); @@ -142,7 +143,7 @@ public async Task Handle_WithQueryInSelfUrl_AnnotationIdDoesNotContainQueryStrin { var selfUrlWithQuery = $"{SelfUrl}?q=quick"; var text = BuildText([("https://example.org/c/1", 1000, 1500, "the quick brown fox")]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "quick", selfUrlWithQuery, SelfUrl), CancellationToken.None); @@ -163,7 +164,7 @@ public async Task Handle_MultiPage_AnnotationsReferenceCorrectCanvas() (canvas1, 1000, 1500, "hello world"), (canvas2, 1000, 1500, "parliament square"), ]); - var handler = new SearchHandler(new StubTextCache(text)); + var handler = new SearchHandler(new StubTextCache(text), new NullLogger()); var result = await handler.Handle( new SearchRequest("test/book", "parliament", SelfUrl, SelfUrl), CancellationToken.None); diff --git a/src/TextServices.Tests/Storage/FileSystemTextStoreTests.cs b/src/TextServices.Tests/Storage/FileSystemTextStoreTests.cs index 228fbf5..737b347 100644 --- a/src/TextServices.Tests/Storage/FileSystemTextStoreTests.cs +++ b/src/TextServices.Tests/Storage/FileSystemTextStoreTests.cs @@ -1,3 +1,4 @@ +using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; using TextServices.Storage; @@ -16,7 +17,7 @@ public sealed class FileSystemTextStoreTests : IDisposable public FileSystemTextStoreTests() { _tempDir = Path.Combine(Path.GetTempPath(), $"TextServicesTests_{Guid.NewGuid():N}"); - _store = new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = _tempDir }); + _store = new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = _tempDir }, NullLogger.Instance); } public void Dispose() diff --git a/src/TextServices.Tests/Storage/S3TextStoreTests.cs b/src/TextServices.Tests/Storage/S3TextStoreTests.cs index 587ff7f..1b05a3d 100644 --- a/src/TextServices.Tests/Storage/S3TextStoreTests.cs +++ b/src/TextServices.Tests/Storage/S3TextStoreTests.cs @@ -1,6 +1,7 @@ using System.Net; using Amazon.S3; using Amazon.S3.Model; +using Microsoft.Extensions.Logging.Abstractions; using Microsoft.Extensions.Options; using Shouldly; using TextServices.Core.Models; @@ -180,7 +181,8 @@ public async Task NoKeyPrefix_KeyStartsWithJobId() // ------------------------------------------------------------------------- private static S3TextStore MakeStore(FakeS3 fake, string prefix = "") - => new(Options.Create(new S3TextStoreOptions { BucketName = Bucket, KeyPrefix = prefix }), fake); + => new(Options.Create(new S3TextStoreOptions { BucketName = Bucket, KeyPrefix = prefix }), fake, + NullLogger.Instance); private static Text BuildText(string words) { From af7e02702c18e061ca8eb07c88d466ba03790dcc Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 12:09:10 +0100 Subject: [PATCH 05/21] Validate job ID must not start with '/' * Reject IDs with a leading slash at creation time; downstream endpoints return 404 naturally since no such job can exist * Use hasData (non-null, non-empty) consistently for the per-page iteration guard, replacing the looser SourceData != null check * Add unit tests covering all JobInstruction validation branches * Add E2E tests confirming 400 on leading slash and 202 on internal slashes Co-Authored-By: Claude Sonnet 4.6 --- .../Features/Jobs/JobInstruction.cs | 11 +- .../BuildAndSearchTests.cs | 24 +++ .../BuilderApi/JobInstructionTests.cs | 189 ++++++++++++++++++ 3 files changed, 222 insertions(+), 2 deletions(-) create mode 100644 src/TextServices.Tests/BuilderApi/JobInstructionTests.cs diff --git a/src/TextServices.Builder.Api/Features/Jobs/JobInstruction.cs b/src/TextServices.Builder.Api/Features/Jobs/JobInstruction.cs index 2497e5c..8768bb5 100644 --- a/src/TextServices.Builder.Api/Features/Jobs/JobInstruction.cs +++ b/src/TextServices.Builder.Api/Features/Jobs/JobInstruction.cs @@ -46,6 +46,13 @@ public class JobInstruction : IValidatableObject public IEnumerable Validate(ValidationContext validationContext) { + if (Id.StartsWith('/')) + { + yield return new ValidationResult( + "Job ID must not start with '/'.", + [nameof(Id)]); + } + var hasUri = !string.IsNullOrWhiteSpace(SourceUri); var hasData = SourceData is { Count: > 0 }; @@ -63,9 +70,9 @@ public IEnumerable Validate(ValidationContext validationContex [nameof(SourceUri), nameof(SourceData)]); } - if (SourceData != null) + if (hasData) { - foreach (var page in SourceData) + foreach (var page in SourceData!) { if (!string.Equals(page.Type, "pdf", StringComparison.OrdinalIgnoreCase) && string.IsNullOrEmpty(page.Id)) diff --git a/src/TextServices.Tests.E2E/BuildAndSearchTests.cs b/src/TextServices.Tests.E2E/BuildAndSearchTests.cs index ef46358..2282c5b 100644 --- a/src/TextServices.Tests.E2E/BuildAndSearchTests.cs +++ b/src/TextServices.Tests.E2E/BuildAndSearchTests.cs @@ -37,6 +37,30 @@ public async Task PostJob_Returns202WithLocation() response.Headers.Location!.ToString().ShouldContain("e2e/lifecycle-test"); } + [Fact] + public async Task PostJob_IdStartingWithSlash_Returns400() + { + var response = await ctx.BuilderClient.PostAsJsonAsync("/textbuilder", new + { + id = "/bad-id", + sourceUri = "https://iiif.wellcomecollection.org/presentation/b2888193x" + }); + + response.StatusCode.ShouldBe(HttpStatusCode.BadRequest); + } + + [Fact] + public async Task PostJob_IdWithInternalSlashes_Returns202() + { + var response = await ctx.BuilderClient.PostAsJsonAsync("/textbuilder", new + { + id = "e2e/slash/in/id", + sourceUri = "https://iiif.wellcomecollection.org/presentation/b2888193x" + }); + + response.StatusCode.ShouldBe(HttpStatusCode.Accepted); + } + [Fact] public async Task PostJob_DuplicateId_Returns409() { diff --git a/src/TextServices.Tests/BuilderApi/JobInstructionTests.cs b/src/TextServices.Tests/BuilderApi/JobInstructionTests.cs new file mode 100644 index 0000000..e3d1161 --- /dev/null +++ b/src/TextServices.Tests/BuilderApi/JobInstructionTests.cs @@ -0,0 +1,189 @@ +using System.ComponentModel.DataAnnotations; +using Shouldly; +using TextServices.Builder.Api.Features.Jobs; + +namespace TextServices.Tests.BuilderApi; + +public class JobInstructionTests +{ + private static List Validate(JobInstruction instruction) + { + var results = new List(); + Validator.TryValidateObject(instruction, new ValidationContext(instruction), results, validateAllProperties: true); + return results; + } + + private static PageInstruction Page(string? id = "https://example.org/canvas/1", string? type = null) => + new() { Id = id, Type = type }; + + // ------------------------------------------------------------------------- + // Id — leading slash + // ------------------------------------------------------------------------- + + [Fact] + public void Id_StartingWithSlash_FailsValidation() + { + var results = Validate(new JobInstruction + { + Id = "/bad-id", + SourceUri = "https://example.org/manifest", + }); + + results.ShouldContain(r => + r.MemberNames.Contains(nameof(JobInstruction.Id)) && + r.ErrorMessage == "Job ID must not start with '/'."); + } + + [Theory] + [InlineData("my-job")] + [InlineData("2/books/my-book")] + [InlineData("customer/collection/item")] + public void Id_NotStartingWithSlash_NoIdError(string id) + { + var results = Validate(new JobInstruction + { + Id = id, + SourceUri = "https://example.org/manifest", + }); + + results.ShouldNotContain(r => r.MemberNames.Contains(nameof(JobInstruction.Id))); + } + + // ------------------------------------------------------------------------- + // Source — neither / both / one + // ------------------------------------------------------------------------- + + [Fact] + public void NeitherSourceUriNorSourceData_FailsValidation() + { + var results = Validate(new JobInstruction { Id = "my-job" }); + + results.ShouldContain(r => + r.ErrorMessage == "Exactly one of sourceUri or sourceData must be provided."); + } + + [Fact] + public void BothSourceUriAndSourceData_FailsValidation() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceUri = "https://example.org/manifest", + SourceData = [Page()], + }); + + results.ShouldContain(r => + r.ErrorMessage == "Provide sourceUri or sourceData, not both."); + } + + [Fact] + public void OnlySourceUri_PassesSourceValidation() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceUri = "https://example.org/manifest", + }); + + results.ShouldNotContain(r => + r.MemberNames.Contains(nameof(JobInstruction.SourceUri)) || + r.MemberNames.Contains(nameof(JobInstruction.SourceData))); + } + + [Fact] + public void OnlySourceData_PassesSourceValidation() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = [Page()], + }); + + results.ShouldNotContain(r => + r.MemberNames.Contains(nameof(JobInstruction.SourceUri)) || + r.MemberNames.Contains(nameof(JobInstruction.SourceData))); + } + + [Fact] + public void EmptySourceDataList_TreatedAsNoSource() + { + // Count: 0 means hasData is false — same as omitting SourceData entirely. + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = [], + }); + + results.ShouldContain(r => + r.ErrorMessage == "Exactly one of sourceUri or sourceData must be provided."); + } + + // ------------------------------------------------------------------------- + // SourceData — per-page id requirement + // ------------------------------------------------------------------------- + + [Fact] + public void SourceData_NonPdfPageWithoutId_FailsValidation() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = [Page(id: null)], + }); + + results.ShouldContain(r => + r.MemberNames.Contains(nameof(JobInstruction.SourceData)) && + r.ErrorMessage == "Each non-pdf page in sourceData must supply an 'id' (canvas identifier URI)."); + } + + [Fact] + public void SourceData_NonPdfPageWithId_PassesValidation() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = [Page(id: "https://example.org/canvas/1")], + }); + + results.ShouldNotContain(r => + r.ErrorMessage == "Each non-pdf page in sourceData must supply an 'id' (canvas identifier URI)."); + } + + [Theory] + [InlineData("pdf")] + [InlineData("PDF")] + [InlineData("Pdf")] + public void SourceData_PdfPageWithoutId_PassesValidation(string pdfType) + { + // pdf-type pages are exempt from the id requirement. + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = [Page(id: null, type: pdfType)], + }); + + results.ShouldNotContain(r => + r.ErrorMessage == "Each non-pdf page in sourceData must supply an 'id' (canvas identifier URI)."); + } + + [Fact] + public void SourceData_MixedPages_OnlyPagesMissingIdFail() + { + var results = Validate(new JobInstruction + { + Id = "my-job", + SourceData = + [ + Page(id: "https://example.org/canvas/1"), + Page(id: null), // should fail + Page(id: null, type: "pdf"), // exempt + Page(id: "https://example.org/canvas/3"), + ], + }); + + // Exactly one error for the page missing its id. + results.Count(r => + r.ErrorMessage == "Each non-pdf page in sourceData must supply an 'id' (canvas identifier URI).") + .ShouldBe(1); + } +} From f39bb6afa660dace79e8ae5184d30fccd12751d1 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 12:26:43 +0100 Subject: [PATCH 06/21] Add example appsettings --- .gitignore | 2 ++ .../appsettings.Development-Example.json | 11 +++++++++++ src/TextServices.Builder.Api/appsettings.json | 12 +----------- .../appsettings.Development-Example.json | 11 +++++++++++ src/TextServices.Search.Api/appsettings.json | 5 +---- 5 files changed, 26 insertions(+), 15 deletions(-) create mode 100644 src/TextServices.Builder.Api/appsettings.Development-Example.json create mode 100644 src/TextServices.Search.Api/appsettings.Development-Example.json diff --git a/.gitignore b/.gitignore index 920616f..9d48280 100644 --- a/.gitignore +++ b/.gitignore @@ -436,3 +436,5 @@ FodyWeavers.xsd *.msix *.msm *.msp + +appsettings.Development.json \ No newline at end of file diff --git a/src/TextServices.Builder.Api/appsettings.Development-Example.json b/src/TextServices.Builder.Api/appsettings.Development-Example.json new file mode 100644 index 0000000..7a0bc49 --- /dev/null +++ b/src/TextServices.Builder.Api/appsettings.Development-Example.json @@ -0,0 +1,11 @@ +{ + "RunMigrations": true, + "ConnectionStrings": { + "BuilderDb": "Server=127.0.0.1;Port=5452;Database=postgres;User Id=postgres;Password=txt_password;IncludeErrorDetail=True" + }, + "CorsAllowedOrigins": [ "http://localhost:5100" ], + "TextServices": { + "SearchApiBaseUrl": "http://localhost:5294", + "AllowFileImageProxy": true + } +} diff --git a/src/TextServices.Builder.Api/appsettings.json b/src/TextServices.Builder.Api/appsettings.json index 05b7fa6..270431d 100644 --- a/src/TextServices.Builder.Api/appsettings.json +++ b/src/TextServices.Builder.Api/appsettings.json @@ -19,15 +19,5 @@ ] }, "AllowedHosts": "*", - "RunMigrations": false, - "CorsAllowedOrigins": [ "http://localhost:5100" ], - "ConnectionStrings": { - "BuilderDb": "" - }, - "TextServices": { - "SearchApiBaseUrl": "", - "Storage": { - "RootPath": "C:/textservices-data" - } - } + "RunMigrations": false } diff --git a/src/TextServices.Search.Api/appsettings.Development-Example.json b/src/TextServices.Search.Api/appsettings.Development-Example.json new file mode 100644 index 0000000..d3d168c --- /dev/null +++ b/src/TextServices.Search.Api/appsettings.Development-Example.json @@ -0,0 +1,11 @@ +{ + "TextServices": { + "BaseUrl": "http://localhost:5294", + "AllowFileImageProxy": true, + "CacheSlidingExpirationMinutes": 30, + "CacheAbsoluteExpirationHours": 4, + "CacheMaxEntries": 20, + "StorageRootPath": "C:/textservices-data" + }, + "CorsAllowedOrigins": [ "http://localhost:5100" ] +} diff --git a/src/TextServices.Search.Api/appsettings.json b/src/TextServices.Search.Api/appsettings.json index fbc7a4d..de9217e 100644 --- a/src/TextServices.Search.Api/appsettings.json +++ b/src/TextServices.Search.Api/appsettings.json @@ -18,12 +18,9 @@ ] }, "AllowedHosts": "*", - "CorsAllowedOrigins": [ "http://localhost:5100" ], "TextServices": { - "BaseUrl": "", "CacheSlidingExpirationMinutes": 30, "CacheAbsoluteExpirationHours": 4, - "CacheMaxEntries": 20, - "StorageRootPath": "C:/textservices-data" + "CacheMaxEntries": 20 } } From 79ca2403f9acb0e721737dbc76cd810341c51ebf Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 12:27:07 +0100 Subject: [PATCH 07/21] Exclude dev appsettings from gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9d48280..8596f40 100644 --- a/.gitignore +++ b/.gitignore @@ -437,4 +437,5 @@ FodyWeavers.xsd *.msm *.msp -appsettings.Development.json \ No newline at end of file +appsettings.Development.json +appsettings.Development* \ No newline at end of file From 90354f45be9620ca9f3d9ebafe5ef9230b9fd78b Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 14:32:42 +0100 Subject: [PATCH 08/21] Add S3 support to Search API, align storage config structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Search API previously hardcoded FileSystemTextStore — any deployment using S3 in the Builder API would write artefacts the Search API could not read. * Add AWSSDK.Extensions.NETCore.Setup to Search API; wire S3TextStore with the same conditional pattern as the Builder API * Rename Storage:RootPath -> Storage:FileSystem:RootPath in both APIs so filesystem and S3 are symmetrically namespaced under Storage: * Add typed S3StorageOptions to Builder API's StorageOptions (was previously read only as raw config strings) * Add missing settings to builder-api.md: RunMigrations, AllowFileImageProxy, Notifications:TopicArn * Use full config key paths (TextServices:...) in both doc tables to make nesting unambiguous * Remove dead CorsAllowedOrigins from Search API example appsettings Co-Authored-By: Claude Sonnet 4.6 --- docs/builder-api.md | 24 ++++++++++----- docs/search-api.md | 26 +++++++++------- .../ServiceCollectionExtensions.cs | 2 +- .../Configuration/TextServicesOptions.cs | 18 +++++++++++ .../Configuration/SearchApiOptions.cs | 30 +++++++++++++++++-- src/TextServices.Search.Api/Program.cs | 25 ++++++++++++---- .../TextServices.Search.Api.csproj | 1 + .../appsettings.Development-Example.json | 12 ++++---- .../Infrastructure/SearchApiFactory.cs | 2 +- 9 files changed, 107 insertions(+), 33 deletions(-) diff --git a/docs/builder-api.md b/docs/builder-api.md index d2ab68a..b30e4a6 100644 --- a/docs/builder-api.md +++ b/docs/builder-api.md @@ -385,14 +385,21 @@ Builder API configuration lives under the `TextServices` key in `appsettings.jso ```json { + "RunMigrations": true, "ConnectionStrings": { "BuilderDb": "Host=localhost;Database=textservices;Username=...;Password=..." }, "TextServices": { "SearchApiBaseUrl": "https://search.example.org", + "AllowFileImageProxy": false, "MaxConcurrentPageFetches": 8, "Storage": { - "RootPath": "/data/textservices" + "FileSystem": { + "RootPath": "/data/textservices" + } + }, + "Notifications": { + "TopicArn": "" } }, "CorsAllowedOrigins": ["https://viewer.example.org"] @@ -401,13 +408,16 @@ Builder API configuration lives under the `TextServices` key in `appsettings.jso | Setting | Default | Description | |---|---|---| +| `RunMigrations` | `false` | When `true`, applies any pending EF Core database migrations on startup. Convenient for dev and simple deployments; leave `false` and run migrations separately in production. | | `ConnectionStrings:BuilderDb` | _(required)_ | PostgreSQL connection string. Used for both EF Core (job state) and Hangfire (job queue). | -| `SearchApiBaseUrl` | `""` | Public base URL of the Search API. Used to populate the `searchV1` / `searchV2` fields in completed job responses, and to construct `/proxy/image` URLs in synthesised Manifests when `sourceData` pages supply `file://` or `s3://` imageUri values. Leave empty if the Search API is not yet deployed. | -| `MaxConcurrentPageFetches` | `8` | Maximum number of text files fetched in parallel within a single job. Increase for internal or S3 sources; keep low (4–8) for third-party HTTP hosts. | -| `ReportBatchProgress` | `true` | When `true`, `PagesCompleted` is flushed to the database every 10 pages during processing, allowing `GET /textbuilder/{id}` to reflect live progress. Set to `false` to reduce database writes on large manifests; the final count is still persisted when the job completes. | -| `Storage:RootPath` | `textservices-data` | Root directory for stored text artefacts. Ignored when S3 storage is configured. Must be readable by the Search API if used. | -| `Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. | -| `Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. | | `CorsAllowedOrigins` | `[]` | Allowed CORS origins. Empty array disables CORS. | +| `TextServices:SearchApiBaseUrl` | `""` | Public base URL of the Search API. Used to populate the `searchV1` / `searchV2` fields in completed job responses, and to construct `/proxy/image` URLs in synthesised Manifests when `sourceData` pages supply `file://` or `s3://` imageUri values. Leave empty if the Search API is not yet deployed. | +| `TextServices:AllowFileImageProxy` | `false` | When `true`, synthesised Manifests embed `/proxy/image` proxy URLs for `file://` and `s3://` imageUri values instead of omitting the painting annotation. Only enable in trusted local-dev environments. The Search API's `AllowFileImageProxy` must also be `true` for those proxy URLs to serve real content. | +| `TextServices:MaxConcurrentPageFetches` | `8` | Maximum number of text files fetched in parallel within a single job. Increase for internal or S3 sources; keep low (4–8) for third-party HTTP hosts. | +| `TextServices:ReportBatchProgress` | `true` | When `true`, `PagesCompleted` is flushed to the database every 10 pages during processing, allowing `GET /textbuilder/{id}` to reflect live progress. Set to `false` to reduce database writes on large manifests; the final count is still persisted when the job completes. | +| `TextServices:Storage:FileSystem:RootPath` | `textservices-data` | Root directory for stored text artefacts. Ignored when S3 storage is configured. Must be readable by the Search API if used. | +| `TextServices:Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. | +| `TextServices:Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. | +| `TextServices:Notifications:TopicArn` | `""` | ARN of an SNS topic to publish a notification to when a job completes (success or failure). Leave empty to disable notifications. | S3 credentials and region are resolved by the standard AWS SDK credential chain (environment variables, instance profile, `appsettings.json` `AWS` section). Configure the region via the `AWS:Region` key or the `AWS_DEFAULT_REGION` environment variable. diff --git a/docs/search-api.md b/docs/search-api.md index cbbeb94..966318a 100644 --- a/docs/search-api.md +++ b/docs/search-api.md @@ -501,7 +501,11 @@ Search API configuration lives under the `TextServices` key in `appsettings.json "CacheSlidingExpirationMinutes": 30, "CacheAbsoluteExpirationHours": 4, "CacheMaxEntries": 20, - "StorageRootPath": "/data/textservices", + "Storage": { + "FileSystem": { + "RootPath": "/data/textservices" + } + }, "PdfTriggerQueueCapacity": 50, "PdfTriggerMaxConcurrency": 2 } @@ -510,15 +514,17 @@ Search API configuration lives under the `TextServices` key in `appsettings.json | Setting | Default | Description | |---|---|---| -| `BaseUrl` | `""` | Public base URL of this API. Required when running behind a reverse proxy; without it, self-referencing URLs in responses will use the incoming `Host` header, which may be internal. | -| `CacheSlidingExpirationMinutes` | `30` | How long a text object stays in the memory cache after its last access. | -| `CacheAbsoluteExpirationHours` | `4` | Hard upper limit on cache lifetime, regardless of access frequency. Prevents large objects from living in the LOH indefinitely. | -| `CacheMaxEntries` | `20` | Maximum number of Text (and AutoComplete) objects held in memory simultaneously. Each object counts as one slot; LRU eviction applies when the limit is reached. Budget approximately 30–40 MB per large text when sizing container memory. | -| `StorageRootPath` | `textservices-data` | Root directory of the text artefact store. Must point to the same location as the Builder API's `Storage:RootPath`. | -| `PdfTriggerQueueCapacity` | `50` | Maximum number of PDF trigger requests that can be queued for background generation. Requests beyond this limit receive `503 Service Unavailable`. | -| `PdfTriggerMaxConcurrency` | `2` | Maximum number of PDFs generated concurrently by the background trigger queue. Each in-flight generation buffers the full PDF in memory — keep this low on memory-constrained hosts. | -| `AllowFileImageProxy` | `false` | When `true`, the `/proxy/image` endpoint streams local `file://` images. Only enable in trusted local-dev environments where those files are not access-controlled. | -| `AllowedCustomHosts` | `[]` | Hostnames accepted from the `X-Forwarded-Host` request header (e.g. custom CloudFront distributions). See [Forwarded-header URL rewriting](#forwarded-header-url-rewriting) below. | +| `TextServices:BaseUrl` | `""` | Public base URL of this API. Required when running behind a reverse proxy; without it, self-referencing URLs in responses will use the incoming `Host` header, which may be internal. | +| `TextServices:CacheSlidingExpirationMinutes` | `30` | How long a text object stays in the memory cache after its last access. | +| `TextServices:CacheAbsoluteExpirationHours` | `4` | Hard upper limit on cache lifetime, regardless of access frequency. Prevents large objects from living in the LOH indefinitely. | +| `TextServices:CacheMaxEntries` | `20` | Maximum number of Text (and AutoComplete) objects held in memory simultaneously. Each object counts as one slot; LRU eviction applies when the limit is reached. Budget approximately 30–40 MB per large text when sizing container memory. | +| `TextServices:Storage:FileSystem:RootPath` | `textservices-data` | Root directory of the text artefact store. Must match the Builder API's `TextServices:Storage:FileSystem:RootPath`. Ignored when S3 storage is configured. | +| `TextServices:Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. Must match the Builder API's `TextServices:Storage:S3:BucketName`. | +| `TextServices:Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. Must match the Builder API's `TextServices:Storage:S3:KeyPrefix`. | +| `TextServices:PdfTriggerQueueCapacity` | `50` | Maximum number of PDF trigger requests that can be queued for background generation. Requests beyond this limit receive `503 Service Unavailable`. | +| `TextServices:PdfTriggerMaxConcurrency` | `2` | Maximum number of PDFs generated concurrently by the background trigger queue. Each in-flight generation buffers the full PDF in memory — keep this low on memory-constrained hosts. | +| `TextServices:AllowFileImageProxy` | `false` | When `true`, the `/proxy/image` endpoint streams local `file://` images. Only enable in trusted local-dev environments where those files are not access-controlled. | +| `TextServices:AllowedCustomHosts` | `[]` | Hostnames accepted from the `X-Forwarded-Host` request header (e.g. custom CloudFront distributions). See [Forwarded-header URL rewriting](#forwarded-header-url-rewriting) below. | --- diff --git a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs index 347ac02..75569cf 100644 --- a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs +++ b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs @@ -83,7 +83,7 @@ public static IServiceCollection AddTextStorage(this IServiceCollection services var storage = sp.GetRequiredService>().Value.Storage; return ActivatorUtilities.CreateInstance( sp, - new FileSystemTextStoreOptions { RootPath = storage.RootPath }); + new FileSystemTextStoreOptions { RootPath = storage.FileSystem.RootPath }); }); return services; diff --git a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs index 1b08e15..ddc0b42 100644 --- a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs +++ b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs @@ -57,11 +57,29 @@ public class TextServicesOptions } public class StorageOptions +{ + /// Options for the filesystem text store. + public FileSystemStorageOptions FileSystem { get; set; } = new(); + + /// Options for S3 storage. When is set, S3 is used instead of the filesystem. + public S3StorageOptions S3 { get; set; } = new(); +} + +public class FileSystemStorageOptions { /// Root path under which text artefacts are stored on the filesystem. public string RootPath { get; set; } = "textservices-data"; } +public class S3StorageOptions +{ + /// S3 bucket for stored artefacts. + public string BucketName { get; set; } = string.Empty; + + /// Optional prefix for all S3 object keys (e.g. "textservices/"). A trailing / is added automatically if omitted. + public string KeyPrefix { get; set; } = string.Empty; +} + public class NotificationsOptions { /// diff --git a/src/TextServices.Search.Api/Configuration/SearchApiOptions.cs b/src/TextServices.Search.Api/Configuration/SearchApiOptions.cs index 1291721..e63f628 100644 --- a/src/TextServices.Search.Api/Configuration/SearchApiOptions.cs +++ b/src/TextServices.Search.Api/Configuration/SearchApiOptions.cs @@ -29,9 +29,6 @@ public class SearchApiOptions /// public int CacheMaxEntries { get; set; } = 20; - /// Root path of the filesystem text store (must match the Builder API's storage path). - public string StorageRootPath { get; set; } = "textservices-data"; - /// Maximum number of background PDF trigger requests to queue. Requests beyond this capacity return 503. public int PdfTriggerQueueCapacity { get; set; } = 50; @@ -65,4 +62,31 @@ public class SearchApiOptions /// X-Forwarded-Host is never honoured. /// public string[] AllowedCustomHosts { get; set; } = []; + + /// Options for the text artefact store. + public SearchStorageOptions Storage { get; set; } = new(); +} + +public class SearchStorageOptions +{ + /// Options for the filesystem text store. + public FileSystemStorageOptions FileSystem { get; set; } = new(); + + /// Options for S3 storage. When is set, S3 is used instead of the filesystem. + public S3StorageOptions S3 { get; set; } = new(); +} + +public class FileSystemStorageOptions +{ + /// Root directory for stored text artefacts. + public string RootPath { get; set; } = "textservices-data"; +} + +public class S3StorageOptions +{ + /// S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. + public string BucketName { get; set; } = string.Empty; + + /// Optional prefix for all S3 object keys (e.g. "textservices/"). A trailing / is added automatically if omitted. + public string KeyPrefix { get; set; } = string.Empty; } diff --git a/src/TextServices.Search.Api/Program.cs b/src/TextServices.Search.Api/Program.cs index ab34191..19c4c22 100644 --- a/src/TextServices.Search.Api/Program.cs +++ b/src/TextServices.Search.Api/Program.cs @@ -1,4 +1,5 @@ using System.IO.Compression; +using Amazon.S3; using AsyncKeyedLock; using Microsoft.AspNetCore.ResponseCompression; using Microsoft.Extensions.Options; @@ -60,13 +61,27 @@ // ---- Storage ---------------------------------------------------------------- +builder.Services.AddDefaultAWSOptions(builder.Configuration.GetAWSOptions()); + +if (!string.IsNullOrEmpty(builder.Configuration["TextServices:Storage:S3:BucketName"])) + builder.Services.AddAWSService(); + builder.Services.AddSingleton(sp => - ActivatorUtilities.CreateInstance( - sp, - new FileSystemTextStoreOptions +{ + var opts = sp.GetRequiredService>().Value; + if (!string.IsNullOrEmpty(opts.Storage.S3.BucketName)) + { + var s3Opts = Options.Create(new S3TextStoreOptions { - RootPath = sp.GetRequiredService>().Value.StorageRootPath - })); + BucketName = opts.Storage.S3.BucketName, + KeyPrefix = opts.Storage.S3.KeyPrefix + }); + return ActivatorUtilities.CreateInstance(sp, s3Opts); + } + return ActivatorUtilities.CreateInstance( + sp, + new FileSystemTextStoreOptions { RootPath = opts.Storage.FileSystem.RootPath }); +}); // ITextStore is also injected directly into TextAugmentedHandler (manifest is plain JSON, // not routed through the Text/AutoComplete cache). diff --git a/src/TextServices.Search.Api/TextServices.Search.Api.csproj b/src/TextServices.Search.Api/TextServices.Search.Api.csproj index 3b99240..91bbebc 100644 --- a/src/TextServices.Search.Api/TextServices.Search.Api.csproj +++ b/src/TextServices.Search.Api/TextServices.Search.Api.csproj @@ -8,6 +8,7 @@ + diff --git a/src/TextServices.Search.Api/appsettings.Development-Example.json b/src/TextServices.Search.Api/appsettings.Development-Example.json index d3d168c..71d8f4a 100644 --- a/src/TextServices.Search.Api/appsettings.Development-Example.json +++ b/src/TextServices.Search.Api/appsettings.Development-Example.json @@ -2,10 +2,10 @@ "TextServices": { "BaseUrl": "http://localhost:5294", "AllowFileImageProxy": true, - "CacheSlidingExpirationMinutes": 30, - "CacheAbsoluteExpirationHours": 4, - "CacheMaxEntries": 20, - "StorageRootPath": "C:/textservices-data" - }, - "CorsAllowedOrigins": [ "http://localhost:5100" ] + "Storage": { + "FileSystem": { + "RootPath": "C:/textservices-data" + } + } + } } diff --git a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs index 0e51021..8f632a9 100644 --- a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs +++ b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs @@ -24,7 +24,7 @@ protected override void ConfigureWebHost(IWebHostBuilder builder) builder.ConfigureAppConfiguration((_, config) => config.AddInMemoryCollection(new Dictionary { - ["TextServices:StorageRootPath"] = storageRoot, + ["TextServices:Storage:FileSystem:RootPath"] = storageRoot, ["TextServices:BaseUrl"] = "http://localhost" })); From f4acca900c010510e0ab417ac54a8936e2afd63b Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 15:27:25 +0100 Subject: [PATCH 09/21] Centralise user-agent strings and add AWS startup logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add UserAgents.Builder / UserAgents.Search constants to TextServices.Infrastructure so both APIs reference one source * Builder was TextServices/1.0, Search had the GitHub URL suffix — now both are TextServices-{Role}/1.0 (+github URL) * Log at Debug when S3 storage or SNS notifications are activated, making startup config easier to diagnose Co-Authored-By: Claude Sonnet 4.6 --- .../Configuration/ServiceCollectionExtensions.cs | 9 ++++++++- .../Configuration/TextServicesOptions.cs | 3 +-- src/TextServices.Infrastructure/Http/UserAgents.cs | 9 +++++++++ .../Configuration/ServiceCollectionExtensions.cs | 3 ++- src/TextServices.Search.Api/Program.cs | 3 +++ 5 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 src/TextServices.Infrastructure/Http/UserAgents.cs diff --git a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs index 75569cf..d60bd14 100644 --- a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs +++ b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs @@ -5,8 +5,11 @@ using Hangfire; using Hangfire.PostgreSql; using Microsoft.Extensions.Options; +using Serilog; +using Serilog.Extensions.Logging; using TextServices.Builder.Api.Services; using TextServices.Builder.Api.Services.Notifications; +using TextServices.Infrastructure.Http; using TextServices.Storage; namespace TextServices.Builder.Api.Configuration; @@ -31,6 +34,8 @@ public static IServiceCollection AddHangfireServices(this IServiceCollection ser public static IServiceCollection AddAwsServices(this IServiceCollection services, IConfiguration configuration) { + var logger = new SerilogLoggerFactory(Log.Logger).CreateLogger(nameof(ServiceCollectionExtensions)); + services.AddDefaultAWSOptions(configuration.GetAWSOptions()); services.Configure(configuration.GetSection("TextServices:Storage:S3")); @@ -38,11 +43,13 @@ public static IServiceCollection AddAwsServices(this IServiceCollection services // chain at startup, which hangs in test environments without real credentials. if (!string.IsNullOrEmpty(configuration["TextServices:Storage:S3:BucketName"])) { + logger.LogDebug("Using S3 storage for text artefacts"); services.AddAWSService(); } if (!string.IsNullOrEmpty(configuration["TextServices:Notifications:TopicArn"])) { + logger.LogDebug("Configuring SNS notifications for job status changes"); services.AddAWSService(); } @@ -53,7 +60,7 @@ public static IServiceCollection AddFetchingServices(this IServiceCollection ser { services.AddHttpClient("Resource", client => { - client.DefaultRequestHeaders.UserAgent.ParseAdd("TextServices/1.0"); + client.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgents.Builder); client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json")); client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/ld+json", 0.9)); client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("*/*", 0.8)); diff --git a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs index ddc0b42..dcc05ee 100644 --- a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs +++ b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs @@ -9,7 +9,6 @@ public class TextServicesOptions /// public string SearchApiBaseUrl { get; set; } = string.Empty; - /// /// Allow the Search API's /proxy/image endpoint to serve file:// image URIs /// supplied in sourceData pages. @@ -26,7 +25,7 @@ public class TextServicesOptions /// referenced by imageUri are not access-controlled. /// /// - public bool AllowFileImageProxy { get; set; } = false; + public bool AllowFileImageProxy { get; set; } /// /// Maximum number of text files (ALTO, VTT, AnnotationPage) fetched concurrently within a single job. diff --git a/src/TextServices.Infrastructure/Http/UserAgents.cs b/src/TextServices.Infrastructure/Http/UserAgents.cs new file mode 100644 index 0000000..2f54846 --- /dev/null +++ b/src/TextServices.Infrastructure/Http/UserAgents.cs @@ -0,0 +1,9 @@ +namespace TextServices.Infrastructure.Http; + +public static class UserAgents +{ + private const string Suffix = " (+https://github.com/dlcs/text-services)"; + + public const string Builder = "TextServices-Builder/1.0" + Suffix; + public const string Search = "TextServices-Search/1.0" + Suffix; +} diff --git a/src/TextServices.Search.Api/Configuration/ServiceCollectionExtensions.cs b/src/TextServices.Search.Api/Configuration/ServiceCollectionExtensions.cs index da74253..6388993 100644 --- a/src/TextServices.Search.Api/Configuration/ServiceCollectionExtensions.cs +++ b/src/TextServices.Search.Api/Configuration/ServiceCollectionExtensions.cs @@ -1,6 +1,7 @@ using Microsoft.AspNetCore.HttpOverrides; using Serilog; using Serilog.Extensions.Logging; +using TextServices.Infrastructure.Http; using TextServices.Pdf; using TextServices.Search.Api.Features.Pdf; @@ -18,7 +19,7 @@ public static IServiceCollection AddPdfServices(this IServiceCollection services .ConfigureHttpClient(c => { c.Timeout = TimeSpan.FromSeconds(60); - c.DefaultRequestHeaders.UserAgent.ParseAdd("TextServices/1.0 (+https://github.com/dlcs/text-services)"); + c.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgents.Search); }); return services; diff --git a/src/TextServices.Search.Api/Program.cs b/src/TextServices.Search.Api/Program.cs index 19c4c22..7b4a119 100644 --- a/src/TextServices.Search.Api/Program.cs +++ b/src/TextServices.Search.Api/Program.cs @@ -64,7 +64,10 @@ builder.Services.AddDefaultAWSOptions(builder.Configuration.GetAWSOptions()); if (!string.IsNullOrEmpty(builder.Configuration["TextServices:Storage:S3:BucketName"])) +{ + Log.Debug("Using S3 storage for text artefacts"); builder.Services.AddAWSService(); +} builder.Services.AddSingleton(sp => { From cd27bad45b768b403e89b533e80a1138ffebcfb0 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 15:41:56 +0100 Subject: [PATCH 10/21] Fixup e2e tests to use appsettings.test.json --- .../appsettings.Test.json | 12 ++++ .../appsettings.Test.json | 9 +++ .../Infrastructure/BuilderApiFactory.cs | 59 ++++++++++++------- .../Infrastructure/SearchApiFactory.cs | 32 +++++++--- 4 files changed, 82 insertions(+), 30 deletions(-) create mode 100644 src/TextServices.Builder.Api/appsettings.Test.json create mode 100644 src/TextServices.Search.Api/appsettings.Test.json diff --git a/src/TextServices.Builder.Api/appsettings.Test.json b/src/TextServices.Builder.Api/appsettings.Test.json new file mode 100644 index 0000000..956f4a1 --- /dev/null +++ b/src/TextServices.Builder.Api/appsettings.Test.json @@ -0,0 +1,12 @@ +{ + "TextServices": { + "Storage": { + "S3": { + "BucketName": "" + } + }, + "Notifications": { + "TopicArn": "" + } + } +} diff --git a/src/TextServices.Search.Api/appsettings.Test.json b/src/TextServices.Search.Api/appsettings.Test.json new file mode 100644 index 0000000..1ab060f --- /dev/null +++ b/src/TextServices.Search.Api/appsettings.Test.json @@ -0,0 +1,9 @@ +{ + "TextServices": { + "Storage": { + "S3": { + "BucketName": "" + } + } + } +} diff --git a/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs b/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs index 524e4b9..108dfeb 100644 --- a/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs +++ b/src/TextServices.Tests.E2E/Infrastructure/BuilderApiFactory.cs @@ -4,6 +4,7 @@ using Microsoft.AspNetCore.TestHost; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging.Abstractions; using TextServices.Builder.Api.Data; using TextServices.Builder.Api.Services; @@ -34,33 +35,47 @@ public class BuilderApiFactory(string connectionString, string storageRoot, stri protected override void ConfigureWebHost(IWebHostBuilder builder) { - builder.ConfigureAppConfiguration((_, config) => - config.AddInMemoryCollection(new Dictionary + builder.ConfigureTestServices(services => { - ["ConnectionStrings:BuilderDb"] = connectionString, - ["TextServices:Storage:RootPath"] = StorageRoot, - ["RunMigrations"] = "false" - })); + // ---- Hangfire: replace PostgreSQL storage with InMemory -------------- + services.AddHangfire(config => config.UseInMemoryStorage()); - builder.ConfigureTestServices(services => - { - // ---- Hangfire: replace PostgreSQL storage with InMemory -------------- - services.AddHangfire(config => config.UseInMemoryStorage()); + // ---- Storage: use shared temp directory ----------------------------- + services.AddSingleton(_ => + new FileSystemTextStore( + new FileSystemTextStoreOptions { RootPath = StorageRoot }, + NullLogger.Instance)); - // ---- Storage: use shared temp directory ----------------------------- - services.AddSingleton(_ => - new FileSystemTextStore( - new FileSystemTextStoreOptions { RootPath = StorageRoot }, - NullLogger.Instance)); + // ---- Fetchers: replace with fixture-based stubs --------------------- + services.AddScoped(_ => + new FixtureAltoFetcher(FixturesRoot)); - // ---- Fetchers: replace with fixture-based stubs --------------------- - services.AddScoped(_ => - new FixtureAltoFetcher(FixturesRoot)); + services.AddScoped(sp => + new FixtureManifestFetcher( + FixturesRoot, + sp.GetRequiredService())); + }) + .UseEnvironment("Test") + .UseDefaultServiceProvider((_, options) => + { + options.ValidateScopes = true; + }); + } - services.AddScoped(sp => - new FixtureManifestFetcher( - FixturesRoot, - sp.GetRequiredService())); + protected override IHost CreateHost(IHostBuilder builder) + { + var projectDir = Directory.GetCurrentDirectory(); + var configPath = Path.Combine(projectDir, "appsettings.Test.json"); + + builder.ConfigureHostConfiguration(config => + { + config.AddInMemoryCollection(new Dictionary + { + ["ConnectionStrings:BuilderDb"] = connectionString, + ["TextServices:Storage:FileSystem:RootPath"] = StorageRoot, + }); + config.AddJsonFile(configPath); }); + return base.CreateHost(builder); } } diff --git a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs index 8f632a9..e0878e4 100644 --- a/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs +++ b/src/TextServices.Tests.E2E/Infrastructure/SearchApiFactory.cs @@ -3,6 +3,7 @@ using Microsoft.AspNetCore.TestHost; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Hosting; using Microsoft.Extensions.Logging.Abstractions; using TextServices.Search.Api.Services; using TextServices.Storage; @@ -21,17 +22,32 @@ public class SearchApiFactory(string storageRoot) : WebApplicationFactory - config.AddInMemoryCollection(new Dictionary - { - ["TextServices:Storage:FileSystem:RootPath"] = storageRoot, - ["TextServices:BaseUrl"] = "http://localhost" - })); - builder.ConfigureTestServices(services => services.AddSingleton(_ => new FileSystemTextStore( new FileSystemTextStoreOptions { RootPath = storageRoot }, - NullLogger.Instance))); + NullLogger.Instance))) + .UseEnvironment("Test") + .UseDefaultServiceProvider((_, options) => + { + options.ValidateScopes = true; + }); + } + + protected override IHost CreateHost(IHostBuilder builder) + { + var projectDir = Directory.GetCurrentDirectory(); + var configPath = Path.Combine(projectDir, "appsettings.Test.json"); + + builder.ConfigureHostConfiguration(config => + { + config.AddInMemoryCollection(new Dictionary + { + ["TextServices:Storage:FileSystem:RootPath"] = storageRoot, + ["TextServices:BaseUrl"] = "http://localhost" + }); + config.AddJsonFile(configPath); + }); + return base.CreateHost(builder); } } From a8f6edb8c0735f32c140ae20ccaa0cf91a635fb0 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 15:43:19 +0100 Subject: [PATCH 11/21] Move test projects to sln folder --- src/TextServices.sln | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/TextServices.sln b/src/TextServices.sln index ffcec20..4c40bfe 100644 --- a/src/TextServices.sln +++ b/src/TextServices.sln @@ -21,6 +21,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TextServices.Pdf", "TextSer EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TextServices.Infrastructure", "TextServices.Infrastructure\TextServices.Infrastructure.csproj", "{209A6FAD-0476-42F5-92CC-624C677D661A}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Testing", "Testing", "{680666A5-6A80-48E3-AEC8-158909FF5FE4}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -143,4 +145,8 @@ Global GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection + GlobalSection(NestedProjects) = preSolution + {AA561B8F-C37B-41A5-808B-489CA2513BB6} = {680666A5-6A80-48E3-AEC8-158909FF5FE4} + {C2A377B6-A433-4BBF-BA0C-A8E8F8A9D6FB} = {680666A5-6A80-48E3-AEC8-158909FF5FE4} + EndGlobalSection EndGlobal From c5bc012e691f7e40c6ab26da2289e26df322455a Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 16:12:56 +0100 Subject: [PATCH 12/21] Add AWSSDK.Security token nuget package Required for local development against AWS --- src/TextServices.Builder.Api/TextServices.Builder.Api.csproj | 1 + src/TextServices.Search.Api/TextServices.Search.Api.csproj | 1 + 2 files changed, 2 insertions(+) diff --git a/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj b/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj index bb7933c..8f90824 100644 --- a/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj +++ b/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj @@ -10,6 +10,7 @@ + diff --git a/src/TextServices.Search.Api/TextServices.Search.Api.csproj b/src/TextServices.Search.Api/TextServices.Search.Api.csproj index 91bbebc..bac02be 100644 --- a/src/TextServices.Search.Api/TextServices.Search.Api.csproj +++ b/src/TextServices.Search.Api/TextServices.Search.Api.csproj @@ -9,6 +9,7 @@ + From def6a3619281d77aab7e5b43aede0a3b1d013c8a Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 16:13:38 +0100 Subject: [PATCH 13/21] Remove unnecessary logging --- .../Configuration/ServiceCollectionExtensions.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs index d60bd14..3b49cf1 100644 --- a/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs +++ b/src/TextServices.Builder.Api/Configuration/ServiceCollectionExtensions.cs @@ -34,8 +34,6 @@ public static IServiceCollection AddHangfireServices(this IServiceCollection ser public static IServiceCollection AddAwsServices(this IServiceCollection services, IConfiguration configuration) { - var logger = new SerilogLoggerFactory(Log.Logger).CreateLogger(nameof(ServiceCollectionExtensions)); - services.AddDefaultAWSOptions(configuration.GetAWSOptions()); services.Configure(configuration.GetSection("TextServices:Storage:S3")); @@ -43,13 +41,11 @@ public static IServiceCollection AddAwsServices(this IServiceCollection services // chain at startup, which hangs in test environments without real credentials. if (!string.IsNullOrEmpty(configuration["TextServices:Storage:S3:BucketName"])) { - logger.LogDebug("Using S3 storage for text artefacts"); services.AddAWSService(); } if (!string.IsNullOrEmpty(configuration["TextServices:Notifications:TopicArn"])) { - logger.LogDebug("Configuring SNS notifications for job status changes"); services.AddAWSService(); } From 88753f7bd175a7f965340e674cf9bb0351a6a72a Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 16:23:17 +0100 Subject: [PATCH 14/21] Make catch-all route ID parameters required Replace {**id} (optional catch-all) with {*id:minlength(1)} across all endpoints in both APIs. {**id} matched empty strings, meaning routes like DELETE /textbuilder/ would reach the handler with an empty id. The minlength(1) constraint causes the route not to match at all, returning 404 cleanly. Add E2E tests verifying that mutation and resource endpoints without an id segment return 404. Co-Authored-By: Claude Sonnet 4.6 --- .../Features/Jobs/JobEndpoints.cs | 6 ++-- .../Annotations/AnnotationEndpoints.cs | 6 ++-- .../Autocomplete/AutocompleteEndpoints.cs | 4 +-- .../Features/Figures/FiguresEndpoints.cs | 2 +- .../Features/Pdf/PdfEndpoints.cs | 4 +-- .../Features/PlainText/PlainTextEndpoints.cs | 2 +- .../Features/Search/SearchEndpoints.cs | 4 +-- .../TextAugmented/TextAugmentedEndpoints.cs | 2 +- .../Services/CacheEndpoints.cs | 2 +- .../BuildAndSearchTests.cs | 29 +++++++++++++++++++ 10 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/TextServices.Builder.Api/Features/Jobs/JobEndpoints.cs b/src/TextServices.Builder.Api/Features/Jobs/JobEndpoints.cs index a1eacd5..0fe19da 100644 --- a/src/TextServices.Builder.Api/Features/Jobs/JobEndpoints.cs +++ b/src/TextServices.Builder.Api/Features/Jobs/JobEndpoints.cs @@ -35,13 +35,13 @@ internal static IEndpointRouteBuilder MapJobEndpoints(this IEndpointRouteBuilder return Results.Ok(result); }); - routes.MapGet("/textbuilder/{**id}", async (string id, ISender sender) => + routes.MapGet("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) => { var response = await sender.Send(new GetJobRequest(id)); return response == null ? Results.NotFound() : Results.Ok(response); }); - routes.MapPut("/textbuilder/{**id}", async (string id, ISender sender) => + routes.MapPut("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) => { var result = await sender.Send(new ReprocessJobRequest(id)); return result.Status switch @@ -52,7 +52,7 @@ internal static IEndpointRouteBuilder MapJobEndpoints(this IEndpointRouteBuilder }; }); - routes.MapDelete("/textbuilder/{**id}", async (string id, ISender sender) => + routes.MapDelete("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) => { var found = await sender.Send(new DeleteJobRequest(id)); return found ? Results.NoContent() : Results.NotFound(); diff --git a/src/TextServices.Search.Api/Features/Annotations/AnnotationEndpoints.cs b/src/TextServices.Search.Api/Features/Annotations/AnnotationEndpoints.cs index cc20fae..f45aa11 100644 --- a/src/TextServices.Search.Api/Features/Annotations/AnnotationEndpoints.cs +++ b/src/TextServices.Search.Api/Features/Annotations/AnnotationEndpoints.cs @@ -8,7 +8,7 @@ internal static class AnnotationEndpoints { internal static IEndpointRouteBuilder MapAnnotationEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/annotations/manifest/v1/{**id}", async ( + routes.MapGet("/annotations/manifest/v1/{*id:minlength(1)}", async ( string id, ISender sender, IOptions options, @@ -20,7 +20,7 @@ internal static IEndpointRouteBuilder MapAnnotationEndpoints(this IEndpointRoute return Results.Json(result, contentType: "application/ld+json"); }); - routes.MapGet("/annotations/lines/v1/{n:int}/{**id}", async ( + routes.MapGet("/annotations/lines/v1/{n:int}/{*id:minlength(1)}", async ( int n, string id, ISender sender, IOptions options, @@ -32,7 +32,7 @@ internal static IEndpointRouteBuilder MapAnnotationEndpoints(this IEndpointRoute return Results.Json(result, contentType: "application/ld+json"); }); - routes.MapGet("/annotations/words/v1/{n:int}/{**id}", async ( + routes.MapGet("/annotations/words/v1/{n:int}/{*id:minlength(1)}", async ( int n, string id, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Features/Autocomplete/AutocompleteEndpoints.cs b/src/TextServices.Search.Api/Features/Autocomplete/AutocompleteEndpoints.cs index 30b2ed9..241c3dd 100644 --- a/src/TextServices.Search.Api/Features/Autocomplete/AutocompleteEndpoints.cs +++ b/src/TextServices.Search.Api/Features/Autocomplete/AutocompleteEndpoints.cs @@ -8,7 +8,7 @@ internal static class AutocompleteEndpoints { internal static IEndpointRouteBuilder MapAutocompleteEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/autocomplete/v1/{**id}", async ( + routes.MapGet("/autocomplete/v1/{*id:minlength(1)}", async ( string id, string? q, ISender sender, IOptions options, @@ -20,7 +20,7 @@ internal static IEndpointRouteBuilder MapAutocompleteEndpoints(this IEndpointRou return Results.Json(result, contentType: "application/ld+json"); }); - routes.MapGet("/autocomplete/v2/{**id}", async ( + routes.MapGet("/autocomplete/v2/{*id:minlength(1)}", async ( string id, string? q, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Features/Figures/FiguresEndpoints.cs b/src/TextServices.Search.Api/Features/Figures/FiguresEndpoints.cs index fcf84ed..25a9ab0 100644 --- a/src/TextServices.Search.Api/Features/Figures/FiguresEndpoints.cs +++ b/src/TextServices.Search.Api/Features/Figures/FiguresEndpoints.cs @@ -8,7 +8,7 @@ internal static class FiguresEndpoints { internal static IEndpointRouteBuilder MapFiguresEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/identified/figures/{**id}", async ( + routes.MapGet("/identified/figures/{*id:minlength(1)}", async ( string id, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Features/Pdf/PdfEndpoints.cs b/src/TextServices.Search.Api/Features/Pdf/PdfEndpoints.cs index 00df646..faf4e4c 100644 --- a/src/TextServices.Search.Api/Features/Pdf/PdfEndpoints.cs +++ b/src/TextServices.Search.Api/Features/Pdf/PdfEndpoints.cs @@ -8,7 +8,7 @@ internal static class PdfEndpoints { internal static IEndpointRouteBuilder MapPdfEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/pdf/v1/{**id}", async (string id, ISender sender) => + routes.MapGet("/pdf/v1/{*id:minlength(1)}", async (string id, ISender sender) => { id = StripPdfExtension(id); var stream = await sender.Send(new PdfRequest(id)); @@ -16,7 +16,7 @@ internal static IEndpointRouteBuilder MapPdfEndpoints(this IEndpointRouteBuilder return Results.Stream(stream, "application/pdf", enableRangeProcessing: false); }); - routes.MapPost("/pdf/v1/{**id}", async ( + routes.MapPost("/pdf/v1/{*id:minlength(1)}", async ( string id, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Features/PlainText/PlainTextEndpoints.cs b/src/TextServices.Search.Api/Features/PlainText/PlainTextEndpoints.cs index 4599474..14c91df 100644 --- a/src/TextServices.Search.Api/Features/PlainText/PlainTextEndpoints.cs +++ b/src/TextServices.Search.Api/Features/PlainText/PlainTextEndpoints.cs @@ -6,7 +6,7 @@ internal static class PlainTextEndpoints { internal static IEndpointRouteBuilder MapPlainTextEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/text/v1/{**id}", async (string id, ISender sender) => + routes.MapGet("/text/v1/{*id:minlength(1)}", async (string id, ISender sender) => { var result = await sender.Send(new RawTextRequest(id)); if (result == null) return Results.NotFound(); diff --git a/src/TextServices.Search.Api/Features/Search/SearchEndpoints.cs b/src/TextServices.Search.Api/Features/Search/SearchEndpoints.cs index c6b480b..78045ef 100644 --- a/src/TextServices.Search.Api/Features/Search/SearchEndpoints.cs +++ b/src/TextServices.Search.Api/Features/Search/SearchEndpoints.cs @@ -8,7 +8,7 @@ internal static class SearchEndpoints { internal static IEndpointRouteBuilder MapSearchEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/search/v1/{**id}", async ( + routes.MapGet("/search/v1/{*id:minlength(1)}", async ( string id, string? q, ISender sender, IOptions options, @@ -21,7 +21,7 @@ internal static IEndpointRouteBuilder MapSearchEndpoints(this IEndpointRouteBuil return Results.Json(result, contentType: "application/ld+json"); }); - routes.MapGet("/search/v2/{**id}", async ( + routes.MapGet("/search/v2/{*id:minlength(1)}", async ( string id, string? q, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs index d008cb2..595ceaf 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs @@ -8,7 +8,7 @@ internal static class TextAugmentedEndpoints { internal static IEndpointRouteBuilder MapTextAugmentedEndpoints(this IEndpointRouteBuilder routes) { - routes.MapGet("/text-augmented/v3/{**id}", async ( + routes.MapGet("/text-augmented/v3/{*id:minlength(1)}", async ( string id, ISender sender, IOptions options, diff --git a/src/TextServices.Search.Api/Services/CacheEndpoints.cs b/src/TextServices.Search.Api/Services/CacheEndpoints.cs index a20a7b4..31407f8 100644 --- a/src/TextServices.Search.Api/Services/CacheEndpoints.cs +++ b/src/TextServices.Search.Api/Services/CacheEndpoints.cs @@ -4,7 +4,7 @@ internal static class CacheEndpoints { internal static IEndpointRouteBuilder MapCacheEndpoints(this IEndpointRouteBuilder routes) { - routes.MapDelete("/cache/v1/{**id}", (string id, ITextCache textCache) => + routes.MapDelete("/cache/v1/{*id:minlength(1)}", (string id, ITextCache textCache) => { textCache.Invalidate(id); return Results.NoContent(); diff --git a/src/TextServices.Tests.E2E/BuildAndSearchTests.cs b/src/TextServices.Tests.E2E/BuildAndSearchTests.cs index 2282c5b..b8fcc4a 100644 --- a/src/TextServices.Tests.E2E/BuildAndSearchTests.cs +++ b/src/TextServices.Tests.E2E/BuildAndSearchTests.cs @@ -400,6 +400,35 @@ public async Task ServiceFlags_SearchEnabled_AutocompleteDisabled_GatesEndpoints acResponse.StatusCode.ShouldBe(HttpStatusCode.NotFound); } + // ------------------------------------------------------------------------- + // Route safety — ID is required on all mutation / resource endpoints + // ------------------------------------------------------------------------- + + [Theory] + [InlineData("/textbuilder/", "DELETE")] + [InlineData("/textbuilder/", "PUT")] + public async Task BuilderApi_MutationWithoutId_Returns404(string path, string method) + { + var request = new HttpRequestMessage(new HttpMethod(method), path); + var response = await ctx.BuilderClient.SendAsync(request); + + response.StatusCode.ShouldBe(HttpStatusCode.NotFound); + } + + [Theory] + [InlineData("/search/v1/")] + [InlineData("/search/v2/")] + [InlineData("/autocomplete/v1/")] + [InlineData("/autocomplete/v2/")] + [InlineData("/text-augmented/v3/")] + [InlineData("/text/v1/")] + [InlineData("/annotations/manifest/v1/")] + public async Task SearchApi_ResourceEndpointWithoutId_Returns404(string path) + { + var response = await ctx.SearchClient.GetAsync(path); + response.StatusCode.ShouldBe(HttpStatusCode.NotFound); + } + // ------------------------------------------------------------------------- // Helpers // ------------------------------------------------------------------------- From aff80df68f0955ccb3eeaaa39e51ac6c6f7f49ab Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 16:48:59 +0100 Subject: [PATCH 15/21] Inject IIIF Search @context and gate services on text existence * InjectSearchServices now appends the v2 and v1 Search API context URLs to @context (promoting a string value to an array as needed, with deduplication). Context belongs at the manifest level alongside the service descriptors it enables. * All three inject methods (search services, rendering links, canvas annotation refs) are now skipped when text is null. If there is no indexed text there is nothing to link to; previously search service descriptors were always written regardless. * InjectRenderingLinks and InjectCanvasAnnotationRefs tightened to non-nullable Text parameter now that the null guard lives at the call site. Co-Authored-By: Claude Sonnet 4.6 --- .../TextAugmented/TextAugmentedQuery.cs | 42 +++++++-- .../SearchApi/TextAugmentedHandlerTests.cs | 91 ++++++++++++++++++- 2 files changed, 122 insertions(+), 11 deletions(-) diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs index 68ad44e..da5f628 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs @@ -55,9 +55,17 @@ public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache, IL var text = await textCache.GetTextAsync(request.Id, ct); ReplaceSelfUrl(manifest, request.SelfUrl); - InjectSearchServices(manifest, baseUrl, id); - InjectRenderingLinks(manifest, baseUrl, id, text); - InjectCanvasAnnotationRefs(manifest, baseUrl, id, text); + if (text != null) + { + InjectSearchServices(manifest, baseUrl, id); + InjectRenderingLinks(manifest, baseUrl, id, text); + InjectCanvasAnnotationRefs(manifest, baseUrl, id, text); + } + else + { + logger.LogDebug("Text not found: {Id}", request.Id); + } + await InjectManifestAnnotationsRefAsync(manifest, baseUrl, id, request.Id); await InjectFiguresRefAsync(manifest, baseUrl, id, request.Id); @@ -119,12 +127,13 @@ private void InjectSearchServices(JsonObject manifest, string baseUrl, string id { manifest["service"] = new JsonArray(searchServiceV2, searchServiceV1); } + + AppendContext(manifest, "http://iiif.io/api/search/2/context.json"); + AppendContext(manifest, "http://iiif.io/api/search/1/context.json"); } - private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text? text) + private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text text) { - if (text == null) return; - logger.LogDebug("Adding rendering: {Id}", id); // PDF is only added when at least one canvas is image-based (IsTemporalContent == false); @@ -159,9 +168,9 @@ private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id } } - private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text? text) + private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text text) { - if (text == null || manifest["items"] is not JsonArray canvases) return; + if (manifest["items"] is not JsonArray canvases) return; logger.LogDebug("Adding canvas annotations: {Id}", id); @@ -276,6 +285,23 @@ private static void AddIfNew(JsonArray array, JsonObject item, bool prepend = tr else array.Add(item); } + private static void AppendContext(JsonObject manifest, string url) + { + if (manifest["@context"] is JsonArray arr) + { + if (!arr.Any(n => n?.GetValue() == url)) + arr.Add(url); + } + else if (manifest["@context"] is JsonValue str) + { + manifest["@context"] = new JsonArray(str.GetValue(), url); + } + else + { + manifest["@context"] = new JsonArray(url); + } + } + private static JsonObject BuildPdfRef(string baseUrl, string id) => new() { ["id"] = $"{baseUrl}/pdf/v1/{id}", diff --git a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs index 651205b..23e23f8 100644 --- a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs @@ -35,6 +35,19 @@ public async Task Handle_ManifestNotFound_ReturnsNull() result.ShouldBeNull(); } + [Fact] + public async Task Handle_NoText_ReturnsManifestWithoutSearchServicesOrContext() + { + var handler = MakeHandler(V3Manifest(), noText: true); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + result.ShouldNotBeNull(); + result["service"].ShouldBeNull(); + result["@context"]!.GetValue().ShouldBe("http://iiif.io/api/presentation/3/context.json"); + } + // ------------------------------------------------------------------------- // @id replacement // ------------------------------------------------------------------------- @@ -209,7 +222,7 @@ public async Task Handle_TemporalOnlyContent_AddsPlainTextButNoPdfLink() [Fact] public async Task Handle_TextNotBuilt_NoRenderingLinks() { - var handler = MakeHandler(V3Manifest(), cachedText: null); + var handler = MakeHandler(V3Manifest(), noText: true); var result = await handler.Handle( new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); @@ -417,12 +430,81 @@ public async Task Handle_AnnotationsArrayAlreadyContainsFiguresRef_DoesNotDuplic "https://search.example.org/identified/figures/test/book").ShouldBe(1); } + // ------------------------------------------------------------------------- + // @context injection + // ------------------------------------------------------------------------- + + [Fact] + public async Task Handle_NoContext_AddsSearchContexts() + { + var handler = MakeHandler("""{"id":"https://example.org/m/1","type":"Manifest"}"""); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var ctx = result!["@context"].ShouldBeOfType(); + ctx.Select(n => n!.GetValue()).ShouldBe( + [ + "http://iiif.io/api/search/2/context.json", + "http://iiif.io/api/search/1/context.json", + ]); + } + + [Fact] + public async Task Handle_StringContext_PromotesToArrayAndAppendsSearchContexts() + { + var handler = MakeHandler(V3Manifest()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var ctx = result!["@context"].ShouldBeOfType(); + ctx.Select(n => n!.GetValue()).ShouldBe( + [ + "http://iiif.io/api/presentation/3/context.json", + "http://iiif.io/api/search/2/context.json", + "http://iiif.io/api/search/1/context.json", + ]); + } + + [Fact] + public async Task Handle_ArrayContext_AppendsSearchContexts() + { + var handler = MakeHandler("""{"id":"https://example.org/m/1","type":"Manifest","@context":["http://iiif.io/api/presentation/3/context.json"]}"""); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var ctx = result!["@context"].ShouldBeOfType(); + ctx.Select(n => n!.GetValue()).ShouldBe( + [ + "http://iiif.io/api/presentation/3/context.json", + "http://iiif.io/api/search/2/context.json", + "http://iiif.io/api/search/1/context.json", + ]); + } + + [Fact] + public async Task Handle_ContextAlreadyContainsSearchUrls_DoesNotDuplicate() + { + var handler = MakeHandler(V3ManifestWithSearchContexts()); + + var result = await handler.Handle( + new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + + var ctx = result!["@context"].ShouldBeOfType(); + ctx.Count(n => n!.GetValue() == "http://iiif.io/api/search/2/context.json").ShouldBe(1); + ctx.Count(n => n!.GetValue() == "http://iiif.io/api/search/1/context.json").ShouldBe(1); + } + private static TextAugmentedHandler MakeHandler( string? manifestJson, string? annotationsJson = null, string? figuresJson = null, - Text? cachedText = null) - => new(new StubTextStore(manifestJson, annotationsJson, figuresJson), new StubTextCache(cachedText), + Text? cachedText = null, + bool noText = false) + => new(new StubTextStore(manifestJson, annotationsJson, figuresJson), + new StubTextCache(noText ? null : (cachedText ?? SpatialText())), new NullLogger()); /// A Text with one spatial (image-based) canvas. @@ -482,6 +564,9 @@ private static string V3ManifestWithExistingAnnotationsRef() private static string V3ManifestWithExistingFiguresRef() => """{"id":"https://example.org/m/1","type":"Manifest","annotations":[{"id":"https://search.example.org/identified/figures/test/book","type":"AnnotationPage"}]}"""; + private static string V3ManifestWithSearchContexts() + => """{"id":"https://example.org/m/1","type":"Manifest","@context":["http://iiif.io/api/presentation/3/context.json","http://iiif.io/api/search/2/context.json","http://iiif.io/api/search/1/context.json"]}"""; + private sealed class StubTextStore( string? manifestJson, string? annotationsJson = null, From 205e44369fc3240cd4003420396d0b0a39e69fbc Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Mon, 1 Jun 2026 16:56:46 +0100 Subject: [PATCH 16/21] Improve hangfire logging --- .../Features/Jobs/DeleteJob.cs | 3 +- .../Jobs/LogContextHelpers.cs | 14 +++ .../Jobs/TextBuildJob.cs | 89 ++++++++++--------- .../Services/Notifications/SnsJobNotifier.cs | 1 + 4 files changed, 62 insertions(+), 45 deletions(-) create mode 100644 src/TextServices.Builder.Api/Jobs/LogContextHelpers.cs diff --git a/src/TextServices.Builder.Api/Features/Jobs/DeleteJob.cs b/src/TextServices.Builder.Api/Features/Jobs/DeleteJob.cs index ee5d7c0..683b9df 100644 --- a/src/TextServices.Builder.Api/Features/Jobs/DeleteJob.cs +++ b/src/TextServices.Builder.Api/Features/Jobs/DeleteJob.cs @@ -23,8 +23,7 @@ public async Task Handle(DeleteJobRequest request, CancellationToken ct) var job = await db.Jobs.FindAsync([request.Id], ct); if (job == null) return false; - if (job.HangfireJobId != null) - hangfire.Delete(job.HangfireJobId); + if (job.HangfireJobId != null) hangfire.Delete(job.HangfireJobId); await textStore.DeleteArtefacts(job.Id); diff --git a/src/TextServices.Builder.Api/Jobs/LogContextHelpers.cs b/src/TextServices.Builder.Api/Jobs/LogContextHelpers.cs new file mode 100644 index 0000000..a48714f --- /dev/null +++ b/src/TextServices.Builder.Api/Jobs/LogContextHelpers.cs @@ -0,0 +1,14 @@ +using Serilog.Context; +using Serilog.Core.Enrichers; + +namespace TextServices.Builder.Api.Jobs; + +internal static class LogContextHelpers +{ + /// + /// "CorrelationId" properties to log context, which is then output as part of default log template. + /// This is useful for filtering logs. Consists of {jobId}:{random-guid} + /// + public static IDisposable SetCorrelationId(string jobId) => + LogContext.Push(new PropertyEnricher("CorrelationId", $"{jobId}:{Guid.NewGuid()}")); +} diff --git a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs index 9b305e8..7fe979d 100644 --- a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs +++ b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs @@ -46,57 +46,60 @@ private record FetchedPage(PageInstruction Page, XElement? Xml, string? StringCo [JobDisplayName("TextBuild: {0}")] public async Task ExecuteAsync(string jobId, IJobCancellationToken cancellationToken) { - var job = await db.Jobs.FindAsync(jobId); - if (job == null) + using (LogContextHelpers.SetCorrelationId(jobId)) { - logger.LogWarning("TextBuildJob invoked for unknown job {JobId}", jobId); - return; - } - - job.Status = JobStatus.Running; - job.Started = DateTimeOffset.UtcNow; - await db.SaveChangesAsync(); + var job = await db.Jobs.FindAsync(jobId); + if (job == null) + { + logger.LogWarning("TextBuildJob invoked for unknown job {JobId}", jobId); + return; + } - try - { - var pages = await GetPages(job, cancellationToken.ShutdownToken); - job.TotalPages = pages.Count; + job.Status = JobStatus.Running; + job.Started = DateTimeOffset.UtcNow; await db.SaveChangesAsync(); - var (wordCount, imageCount, errors, fulfilled) = - await ProcessPages(job, pages, cancellationToken); + try + { + var pages = await GetPages(job, cancellationToken.ShutdownToken); + job.TotalPages = pages.Count; + await db.SaveChangesAsync(); - job.TotalWordCount = wordCount; - job.TotalImageCount = imageCount; - job.Errors = errors.Count > 0 ? string.Join('\n', errors) : null; - job.FulfilledServices = (int)fulfilled; - job.Status = JobStatus.Completed; - job.Finished = DateTimeOffset.UtcNow; - await db.SaveChangesAsync(); + var (wordCount, imageCount, errors, fulfilled) = + await ProcessPages(job, pages, cancellationToken); - logger.LogInformation( - "TextBuildJob completed for {JobId}: {WordCount} words, {ImageCount} images, " + - "{ErrorCount} page error(s)", - jobId, wordCount, imageCount, errors.Count); + job.TotalWordCount = wordCount; + job.TotalImageCount = imageCount; + job.Errors = errors.Count > 0 ? string.Join('\n', errors) : null; + job.FulfilledServices = (int)fulfilled; + job.Status = JobStatus.Completed; + job.Finished = DateTimeOffset.UtcNow; + await db.SaveChangesAsync(); - await jobNotifier.Notify( - new JobCompletionNotification(job.Id, job.Status, job.Finished, - job.TotalPages, job.TotalWordCount, job.Errors), - cancellationToken.ShutdownToken); - } - catch (Exception ex) - { - logger.LogError(ex, "TextBuildJob failed for {JobId}", jobId); - job.Status = JobStatus.Failed; - job.Errors = ex.Message; - job.FulfilledServices = (int)JobServices.None; - job.Finished = DateTimeOffset.UtcNow; - await db.SaveChangesAsync(); + logger.LogInformation( + "TextBuildJob completed for {JobId}: {WordCount} words, {ImageCount} images, " + + "{ErrorCount} page error(s)", + jobId, wordCount, imageCount, errors.Count); - await jobNotifier.Notify( - new JobCompletionNotification(job.Id, job.Status, job.Finished, - job.TotalPages, job.TotalWordCount, job.Errors), - cancellationToken.ShutdownToken); + await jobNotifier.Notify( + new JobCompletionNotification(job.Id, job.Status, job.Finished, + job.TotalPages, job.TotalWordCount, job.Errors), + cancellationToken.ShutdownToken); + } + catch (Exception ex) + { + logger.LogError(ex, "TextBuildJob failed for {JobId}", jobId); + job.Status = JobStatus.Failed; + job.Errors = ex.Message; + job.FulfilledServices = (int)JobServices.None; + job.Finished = DateTimeOffset.UtcNow; + await db.SaveChangesAsync(); + + await jobNotifier.Notify( + new JobCompletionNotification(job.Id, job.Status, job.Finished, + job.TotalPages, job.TotalWordCount, job.Errors), + cancellationToken.ShutdownToken); + } } } diff --git a/src/TextServices.Builder.Api/Services/Notifications/SnsJobNotifier.cs b/src/TextServices.Builder.Api/Services/Notifications/SnsJobNotifier.cs index 26e8aff..3391275 100644 --- a/src/TextServices.Builder.Api/Services/Notifications/SnsJobNotifier.cs +++ b/src/TextServices.Builder.Api/Services/Notifications/SnsJobNotifier.cs @@ -20,6 +20,7 @@ public async Task Notify(JobCompletionNotification notification, CancellationTok try { var messageType = notification.Status == JobStatus.Completed ? "JobCompleted" : "JobFailed"; + logger.LogDebug("Raising {MessageType} notification: {JobId}", messageType, notification.JobId); await sns.PublishAsync(new PublishRequest { From 37192fd0a6f5019ebb36819310de5b9998d8b50a Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Tue, 2 Jun 2026 10:31:10 +0100 Subject: [PATCH 17/21] Rename file to match type --- .../{TextAugmentedQuery.cs => TextAugmentedRequest.cs} | 1 + 1 file changed, 1 insertion(+) rename src/TextServices.Search.Api/Features/TextAugmented/{TextAugmentedQuery.cs => TextAugmentedRequest.cs} (99%) diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs similarity index 99% rename from src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs rename to src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs index da5f628..ddb4548 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedQuery.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs @@ -4,6 +4,7 @@ using TextServices.Search.Api.Services; using TextServices.Storage; + namespace TextServices.Search.Api.Features.TextAugmented; /// From a63dc0043aa2ba8ca22e4813cd39f495b0337c85 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Tue, 2 Jun 2026 11:34:12 +0100 Subject: [PATCH 18/21] Downgrade mediatr --- src/TextServices.Builder.Api/TextServices.Builder.Api.csproj | 2 +- src/TextServices.Search.Api/TextServices.Search.Api.csproj | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj b/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj index 8f90824..2d4814a 100644 --- a/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj +++ b/src/TextServices.Builder.Api/TextServices.Builder.Api.csproj @@ -16,7 +16,7 @@ - + runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/src/TextServices.Search.Api/TextServices.Search.Api.csproj b/src/TextServices.Search.Api/TextServices.Search.Api.csproj index bac02be..9a111c4 100644 --- a/src/TextServices.Search.Api/TextServices.Search.Api.csproj +++ b/src/TextServices.Search.Api/TextServices.Search.Api.csproj @@ -10,7 +10,7 @@ - + From a88ee577569dfbaed45a9ecfbe362c610c35d253 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Tue, 2 Jun 2026 11:34:31 +0100 Subject: [PATCH 19/21] Extra logging, make CancellationTokens required --- src/TextServices.Builder.Api/Jobs/TextBuildJob.cs | 2 +- src/TextServices.Builder.Api/Services/AltoFetcher.cs | 5 +++-- .../Services/AnnotationPageFetcher.cs | 4 +--- src/TextServices.Builder.Api/Services/IAltoFetcher.cs | 2 +- .../Services/IAnnotationPageFetcher.cs | 2 +- .../Services/IManifestFetcher.cs | 2 +- .../Services/IResourceFetcher.cs | 2 +- src/TextServices.Builder.Api/Services/IVttFetcher.cs | 2 +- src/TextServices.Builder.Api/Services/ManifestFetcher.cs | 2 +- src/TextServices.Builder.Api/Services/ResourceFetcher.cs | 9 +++------ src/TextServices.Builder.Api/Services/VttFetcher.cs | 4 +--- .../Features/Pdf/PdfGenerationService.cs | 4 ++-- 12 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs index 7fe979d..2d404d3 100644 --- a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs +++ b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs @@ -503,7 +503,7 @@ private async Task FetchPageAsync(PageInstruction page, Cancellatio } catch (Exception ex) { - logger.LogWarning(ex, "Failed to fetch page {CanvasId} from {Url}", page.Id, page.TextUri); + logger.LogWarning(ex, "Failed to fetch {Url} for canvas {CanvasId}", page.TextUri, page.Id); return new FetchedPage(page, null, null, $"{page.Id}: {ex.Message}"); } } diff --git a/src/TextServices.Builder.Api/Services/AltoFetcher.cs b/src/TextServices.Builder.Api/Services/AltoFetcher.cs index 125f57b..2e13891 100644 --- a/src/TextServices.Builder.Api/Services/AltoFetcher.cs +++ b/src/TextServices.Builder.Api/Services/AltoFetcher.cs @@ -2,10 +2,11 @@ namespace TextServices.Builder.Api.Services; -public sealed class AltoFetcher(IResourceFetcher fetcher) : IAltoFetcher +public sealed class AltoFetcher(IResourceFetcher fetcher, ILogger logger) : IAltoFetcher { - public async Task FetchAsync(string uri, CancellationToken ct = default) + public async Task FetchAsync(string uri, CancellationToken ct) { + logger.LogDebug("Fetching alto: {Uri}", uri); await using var stream = await fetcher.FetchAsync(uri, ct); if (stream is null) return null; diff --git a/src/TextServices.Builder.Api/Services/AnnotationPageFetcher.cs b/src/TextServices.Builder.Api/Services/AnnotationPageFetcher.cs index 8983ba1..0a833bd 100644 --- a/src/TextServices.Builder.Api/Services/AnnotationPageFetcher.cs +++ b/src/TextServices.Builder.Api/Services/AnnotationPageFetcher.cs @@ -1,12 +1,10 @@ -using Microsoft.Extensions.Logging; - namespace TextServices.Builder.Api.Services; public sealed class AnnotationPageFetcher( IResourceFetcher fetcher, ILogger logger) : IAnnotationPageFetcher { - public async Task FetchAsync(string uri, CancellationToken ct = default) + public async Task FetchAsync(string uri, CancellationToken ct) { logger.LogDebug("Fetching annotation page: {Uri}", uri); await using var stream = await fetcher.FetchAsync(uri, ct); diff --git a/src/TextServices.Builder.Api/Services/IAltoFetcher.cs b/src/TextServices.Builder.Api/Services/IAltoFetcher.cs index fc40784..910c693 100644 --- a/src/TextServices.Builder.Api/Services/IAltoFetcher.cs +++ b/src/TextServices.Builder.Api/Services/IAltoFetcher.cs @@ -13,5 +13,5 @@ public interface IAltoFetcher /// or the response body is empty — both treated as a sparse page. /// Throws for other HTTP errors or parse failures. /// - Task FetchAsync(string uri, CancellationToken ct = default); + Task FetchAsync(string uri, CancellationToken ct); } diff --git a/src/TextServices.Builder.Api/Services/IAnnotationPageFetcher.cs b/src/TextServices.Builder.Api/Services/IAnnotationPageFetcher.cs index a4c5e79..cbd8beb 100644 --- a/src/TextServices.Builder.Api/Services/IAnnotationPageFetcher.cs +++ b/src/TextServices.Builder.Api/Services/IAnnotationPageFetcher.cs @@ -10,5 +10,5 @@ public interface IAnnotationPageFetcher /// or if the resource returns HTTP 404. /// Throws for other failure status codes. /// - Task FetchAsync(string uri, CancellationToken ct = default); + Task FetchAsync(string uri, CancellationToken ct); } diff --git a/src/TextServices.Builder.Api/Services/IManifestFetcher.cs b/src/TextServices.Builder.Api/Services/IManifestFetcher.cs index fb5f1a0..ea03579 100644 --- a/src/TextServices.Builder.Api/Services/IManifestFetcher.cs +++ b/src/TextServices.Builder.Api/Services/IManifestFetcher.cs @@ -12,7 +12,7 @@ public interface IManifestFetcher /// Downloads the manifest at , stores the raw JSON, and /// returns both the JSON string and the reduced page sequence. /// - Task FetchAndReduce(string uri, CancellationToken ct = default); + Task FetchAndReduce(string uri, CancellationToken ct); } /// The raw manifest JSON as returned by the server. diff --git a/src/TextServices.Builder.Api/Services/IResourceFetcher.cs b/src/TextServices.Builder.Api/Services/IResourceFetcher.cs index 0b7646c..bdca91a 100644 --- a/src/TextServices.Builder.Api/Services/IResourceFetcher.cs +++ b/src/TextServices.Builder.Api/Services/IResourceFetcher.cs @@ -15,5 +15,5 @@ public interface IResourceFetcher /// /// s3:// URI was given but S3 is not configured. /// - Task FetchAsync(string uri, CancellationToken ct = default); + Task FetchAsync(string uri, CancellationToken ct); } diff --git a/src/TextServices.Builder.Api/Services/IVttFetcher.cs b/src/TextServices.Builder.Api/Services/IVttFetcher.cs index e6296b8..6058045 100644 --- a/src/TextServices.Builder.Api/Services/IVttFetcher.cs +++ b/src/TextServices.Builder.Api/Services/IVttFetcher.cs @@ -10,5 +10,5 @@ public interface IVttFetcher /// Returns for HTTP 404 (sparse page). /// Throws for other failure conditions. /// - Task FetchAsync(string uri, CancellationToken ct = default); + Task FetchAsync(string uri, CancellationToken ct); } diff --git a/src/TextServices.Builder.Api/Services/ManifestFetcher.cs b/src/TextServices.Builder.Api/Services/ManifestFetcher.cs index 5710bcb..8c36278 100644 --- a/src/TextServices.Builder.Api/Services/ManifestFetcher.cs +++ b/src/TextServices.Builder.Api/Services/ManifestFetcher.cs @@ -2,7 +2,7 @@ namespace TextServices.Builder.Api.Services; public sealed class ManifestFetcher(IResourceFetcher fetcher, IManifestReducer reducer) : IManifestFetcher { - public async Task FetchAndReduce(string uri, CancellationToken ct = default) + public async Task FetchAndReduce(string uri, CancellationToken ct) { await using var stream = await fetcher.FetchAsync(uri, ct) ?? throw new InvalidOperationException($"Manifest not found at: {uri}"); diff --git a/src/TextServices.Builder.Api/Services/ResourceFetcher.cs b/src/TextServices.Builder.Api/Services/ResourceFetcher.cs index 0640481..43ea506 100644 --- a/src/TextServices.Builder.Api/Services/ResourceFetcher.cs +++ b/src/TextServices.Builder.Api/Services/ResourceFetcher.cs @@ -22,8 +22,7 @@ public sealed class ResourceFetcher(IHttpClientFactory httpClientFactory, IAmazo "file" => Task.FromResult(FetchFile(parsed)), "s3" => FetchS3Async(parsed, ct), "http" or "https" => FetchHttpAsync(uri, ct), - var scheme => throw new NotSupportedException( - $"Unsupported URI scheme '{scheme}': {uri}"), + var scheme => throw new NotSupportedException($"Unsupported URI scheme '{scheme}': {uri}"), }; } @@ -44,8 +43,7 @@ public sealed class ResourceFetcher(IHttpClientFactory httpClientFactory, IAmazo { if (s3 is null) { - throw new InvalidOperationException( - $"s3:// URI encountered but IAmazonS3 is not configured: {uri}"); + throw new InvalidOperationException($"s3:// URI encountered but IAmazonS3 is not configured: {uri}"); } var bucket = uri.Host; @@ -53,8 +51,7 @@ public sealed class ResourceFetcher(IHttpClientFactory httpClientFactory, IAmazo try { - var response = await s3.GetObjectAsync( - new GetObjectRequest { BucketName = bucket, Key = key }, ct); + var response = await s3.GetObjectAsync(new GetObjectRequest { BucketName = bucket, Key = key }, ct); return response.ResponseStream; } catch (AmazonS3Exception ex) when (ex.StatusCode == HttpStatusCode.NotFound) diff --git a/src/TextServices.Builder.Api/Services/VttFetcher.cs b/src/TextServices.Builder.Api/Services/VttFetcher.cs index 45cb0a5..d325c7d 100644 --- a/src/TextServices.Builder.Api/Services/VttFetcher.cs +++ b/src/TextServices.Builder.Api/Services/VttFetcher.cs @@ -1,10 +1,8 @@ -using Microsoft.Extensions.Logging; - namespace TextServices.Builder.Api.Services; public sealed class VttFetcher(IResourceFetcher fetcher, ILogger logger) : IVttFetcher { - public async Task FetchAsync(string uri, CancellationToken ct = default) + public async Task FetchAsync(string uri, CancellationToken ct) { logger.LogDebug("Fetching VTT: {Uri}", uri); await using var stream = await fetcher.FetchAsync(uri, ct); diff --git a/src/TextServices.Search.Api/Features/Pdf/PdfGenerationService.cs b/src/TextServices.Search.Api/Features/Pdf/PdfGenerationService.cs index 72b65a6..6930362 100644 --- a/src/TextServices.Search.Api/Features/Pdf/PdfGenerationService.cs +++ b/src/TextServices.Search.Api/Features/Pdf/PdfGenerationService.cs @@ -38,14 +38,14 @@ private async Task GenerateAsync(string id, CancellationToken ct) var text = await textStore.LoadText(id); if (text == null) { - logger.LogWarning("PDF generation: no Text artefact for {Id}", id); + logger.LogWarning("No Text artefact for {Id}", id); return; } var manifestJson = await textStore.LoadManifest(id); if (manifestJson == null) { - logger.LogWarning("PDF generation: no Manifest for {Id}", id); + logger.LogWarning("No Manifest for {Id}", id); return; } From 03581576ca65be684119bbf9a9d05853269d5509 Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Tue, 2 Jun 2026 11:57:27 +0100 Subject: [PATCH 20/21] Remove TODOs (issues created) --- .../Configuration/TextServicesOptions.cs | 8 -------- src/TextServices.Builder.Api/Jobs/TextBuildJob.cs | 12 ------------ src/TextServices.Demo/wwwroot/js/builder.js | 1 - 3 files changed, 21 deletions(-) diff --git a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs index dcc05ee..a426d98 100644 --- a/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs +++ b/src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs @@ -29,14 +29,6 @@ public class TextServicesOptions /// /// Maximum number of text files (ALTO, VTT, AnnotationPage) fetched concurrently within a single job. - /// The right value depends on the source: - /// - /// Third-party HTTP (Wellcome, Internet Archive, etc.): 4–8 for politeness. - /// Internal/trusted HTTP: 16–32. - /// S3 (same-region, same-bucket): 64–128 — S3 handles high parallelism well. - /// - /// TODO: When S3 storage is added, consider deriving the limit automatically from - /// the URI scheme/host of the text links, or adding a per-host override table here. /// public int MaxConcurrentPageFetches { get; set; } = 8; diff --git a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs index 2d404d3..95b4c2d 100644 --- a/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs +++ b/src/TextServices.Builder.Api/Jobs/TextBuildJob.cs @@ -147,18 +147,6 @@ private async Task> GetPages(BuilderJob job, Canc // Fetch all pages concurrently, bounded by MaxConcurrentPageFetches. // Results are stored into a pre-allocated array so TextBuilder receives // canvases in the correct sequence. - // - // TODO: The right concurrency limit depends on where the text files live. - // - Third-party HTTP (e.g. Wellcome, Internet Archive): keep low (4–8) for - // politeness and to avoid rate-limiting. - // - Internal/trusted HTTP: can be higher (16–32). - // - S3 (s3:// or https://*.s3.amazonaws.com): S3 supports very high - // parallelism on the same bucket; 64–128 is reasonable. When an S3 - // ITextStore is in use the ALTO URIs will typically be pre-signed HTTPS - // URLs or s3:// keys fetched via the AWS SDK — detect by scheme or - // hostname pattern and use a higher limit for those. - // Consider deriving the limit from the scheme/host of pages[0].Text, or - // adding a per-host override table to TextServicesOptions. // pdf-type pages embed an existing PDF; they have no text to build. // Custom-type pages are generated at PDF render time; no text to fetch. diff --git a/src/TextServices.Demo/wwwroot/js/builder.js b/src/TextServices.Demo/wwwroot/js/builder.js index 08386ae..ab7c8bc 100644 --- a/src/TextServices.Demo/wwwroot/js/builder.js +++ b/src/TextServices.Demo/wwwroot/js/builder.js @@ -96,7 +96,6 @@ async function refreshJobs() { let apiJobs = []; let apiReachable = true; try { - // TODO: add paging controls — pageSize=100 is a temporary workaround const res = await fetch(`${config.builderApi}/textbuilder?pageSize=100`); if (res.ok) { const data = await res.json(); From 414a6247db77b2c6842b50a97f9bbf1b688648ce Mon Sep 17 00:00:00 2001 From: Donald Gray Date: Tue, 2 Jun 2026 14:36:35 +0100 Subject: [PATCH 21/21] Centralise Search API URL construction via SearchApiRoutes Hardcoded route-segment interpolations in TextAugmentedHandler and JobResponse are replaced with calls to SearchApiRoutes, a new static class in TextServices.Infrastructure. ResolvedRequest gains extension methods that delegate to SearchApiRoutes, enabling resolved.SearchV1Url() etc. TextAugmentedRequest now accepts ResolvedRequest directly rather than individual URL strings. Co-Authored-By: Claude Sonnet 4.6 --- .../Features/Jobs/JobResponse.cs | 19 ++--- .../SearchApiRoutes.cs | 25 ++++++ .../Features/EndpointHelpers.cs | 13 --- .../Features/ResolvedRequest.cs | 48 +++++++++++ .../TextAugmented/TextAugmentedEndpoints.cs | 2 +- .../TextAugmented/TextAugmentedRequest.cs | 79 +++++++++---------- .../SearchApi/CapabilityGatingTests.cs | 3 +- .../SearchApi/TextAugmentedHandlerTests.cs | 68 ++++++++-------- 8 files changed, 159 insertions(+), 98 deletions(-) create mode 100644 src/TextServices.Infrastructure/SearchApiRoutes.cs create mode 100644 src/TextServices.Search.Api/Features/ResolvedRequest.cs diff --git a/src/TextServices.Builder.Api/Features/Jobs/JobResponse.cs b/src/TextServices.Builder.Api/Features/Jobs/JobResponse.cs index 9850bbb..de27682 100644 --- a/src/TextServices.Builder.Api/Features/Jobs/JobResponse.cs +++ b/src/TextServices.Builder.Api/Features/Jobs/JobResponse.cs @@ -1,5 +1,6 @@ using TextServices.Builder.Api.Configuration; using TextServices.Builder.Api.Data; +using TextServices.Infrastructure; using TextServices.Storage; namespace TextServices.Builder.Api.Features.Jobs; @@ -76,30 +77,30 @@ public static JobResponse From(BuilderJob job, TextServicesOptions options) if (fulfilled.HasFlag(JobServices.Search)) { - searchV1 = $"{baseUrl}/search/v1/{job.Id}"; - searchV2 = $"{baseUrl}/search/v2/{job.Id}"; + searchV1 = SearchApiRoutes.SearchV1(baseUrl, job.Id); + searchV2 = SearchApiRoutes.SearchV2(baseUrl, job.Id); } if (fulfilled.HasFlag(JobServices.Autocomplete)) { - autocompleteV1 = $"{baseUrl}/autocomplete/v1/{job.Id}"; - autocompleteV2 = $"{baseUrl}/autocomplete/v2/{job.Id}"; + autocompleteV1 = SearchApiRoutes.AutocompleteV1(baseUrl, job.Id); + autocompleteV2 = SearchApiRoutes.AutocompleteV2(baseUrl, job.Id); } if (fulfilled.HasFlag(JobServices.FullText)) - fullText = $"{baseUrl}/text/v1/{job.Id}"; + fullText = SearchApiRoutes.FullText(baseUrl, job.Id); if (fulfilled.HasFlag(JobServices.Pdf)) - pdf = $"{baseUrl}/pdf/v1/{job.Id}"; + pdf = SearchApiRoutes.Pdf(baseUrl, job.Id); if (fulfilled.HasFlag(JobServices.TextAugmented)) - textAugmented = $"{baseUrl}/text-augmented/v3/{job.Id}"; + textAugmented = SearchApiRoutes.TextAugmented(baseUrl, job.Id); if (fulfilled.HasFlag(JobServices.Annotations)) - annotations = $"{baseUrl}/annotations/manifest/v1/{job.Id}"; + annotations = SearchApiRoutes.AnnotationsManifest(baseUrl, job.Id); if (fulfilled.HasFlag(JobServices.Figures)) - figures = $"{baseUrl}/identified/figures/{job.Id}"; + figures = SearchApiRoutes.Figures(baseUrl, job.Id); } return new JobResponse diff --git a/src/TextServices.Infrastructure/SearchApiRoutes.cs b/src/TextServices.Infrastructure/SearchApiRoutes.cs new file mode 100644 index 0000000..486ccfb --- /dev/null +++ b/src/TextServices.Infrastructure/SearchApiRoutes.cs @@ -0,0 +1,25 @@ +namespace TextServices.Infrastructure; + +/// +/// Builds absolute URLs for all Search API endpoints. A single place to update if any +/// route segment changes. +/// +public static class SearchApiRoutes +{ + public static string SearchV1(string baseUrl, string id) => $"{baseUrl}/search/v1/{id}"; + public static string SearchV2(string baseUrl, string id) => $"{baseUrl}/search/v2/{id}"; + public static string AutocompleteV1(string baseUrl, string id) => $"{baseUrl}/autocomplete/v1/{id}"; + public static string AutocompleteV2(string baseUrl, string id) => $"{baseUrl}/autocomplete/v2/{id}"; + public static string FullText(string baseUrl, string id) => $"{baseUrl}/text/v1/{id}"; + public static string Pdf(string baseUrl, string id) => $"{baseUrl}/pdf/v1/{id}"; + public static string TextAugmented(string baseUrl, string id) => $"{baseUrl}/text-augmented/v3/{id}"; + public static string AnnotationsManifest(string baseUrl, string id) => $"{baseUrl}/annotations/manifest/v1/{id}"; + + public static string AnnotationsLines(string baseUrl, string id, int canvasIndex) => + $"{baseUrl}/annotations/lines/v1/{canvasIndex}/{id}"; + + public static string AnnotationsWords(string baseUrl, string id, int canvasIndex) => + $"{baseUrl}/annotations/words/v1/{canvasIndex}/{id}"; + + public static string Figures(string baseUrl, string id) => $"{baseUrl}/identified/figures/{id}"; +} diff --git a/src/TextServices.Search.Api/Features/EndpointHelpers.cs b/src/TextServices.Search.Api/Features/EndpointHelpers.cs index bea501e..1298a51 100644 --- a/src/TextServices.Search.Api/Features/EndpointHelpers.cs +++ b/src/TextServices.Search.Api/Features/EndpointHelpers.cs @@ -2,19 +2,6 @@ namespace TextServices.Search.Api.Features; -/// -/// Holds the resolved URL components for a single endpoint request, accounting for -/// X-Forwarded-Host and X-Forwarded-Path proxy headers. -/// -/// -/// Job id to use in generated IIIF URLs. Extracted from X-Forwarded-Path when a -/// whitelisted host is present; otherwise equals the original route id. -/// -/// Absolute URL for this endpoint response (base + route prefix + effective id + query). -/// Absolute URL without query string. Used as the base for child resource IDs (e.g. annotations). -/// Scheme + authority only (no path). Configured URL or based on x-forwarded-host if valid. -internal record ResolvedRequest(string EffectiveId, string SelfUrl, string ResourceUrl, string BaseUrl); - internal static class EndpointHelpers { /// diff --git a/src/TextServices.Search.Api/Features/ResolvedRequest.cs b/src/TextServices.Search.Api/Features/ResolvedRequest.cs new file mode 100644 index 0000000..008e994 --- /dev/null +++ b/src/TextServices.Search.Api/Features/ResolvedRequest.cs @@ -0,0 +1,48 @@ +using TextServices.Infrastructure; + +namespace TextServices.Search.Api.Features; + +/// +/// Holds the resolved URL components for a single endpoint request, accounting for +/// X-Forwarded-Host and X-Forwarded-Path proxy headers. +/// +/// +/// Job id to use in generated IIIF URLs. Extracted from X-Forwarded-Path when a +/// whitelisted host is present; otherwise equals the original route id. +/// +/// Absolute URL for this endpoint response (base + route prefix + effective id + query). +/// Absolute URL without query string. Used as the base for child resource IDs (e.g. annotations). +/// Scheme + authority only (no path). Configured URL or based on x-forwarded-host if valid. +internal record ResolvedRequest(string EffectiveId, string SelfUrl, string ResourceUrl, string BaseUrl); + +/// +/// Extension methods for to ease route generation +/// +internal static class ResolvedRequestExtensions +{ + public static string SearchV1Url(this ResolvedRequest r) => SearchApiRoutes.SearchV1(r.BaseUrl, r.EffectiveId); + public static string SearchV2Url(this ResolvedRequest r) => SearchApiRoutes.SearchV2(r.BaseUrl, r.EffectiveId); + + public static string AutocompleteV1Url(this ResolvedRequest r) => + SearchApiRoutes.AutocompleteV1(r.BaseUrl, r.EffectiveId); + + public static string AutocompleteV2Url(this ResolvedRequest r) => + SearchApiRoutes.AutocompleteV2(r.BaseUrl, r.EffectiveId); + + public static string FullTextUrl(this ResolvedRequest r) => SearchApiRoutes.FullText(r.BaseUrl, r.EffectiveId); + public static string PdfUrl(this ResolvedRequest r) => SearchApiRoutes.Pdf(r.BaseUrl, r.EffectiveId); + + public static string TextAugmentedUrl(this ResolvedRequest r) => + SearchApiRoutes.TextAugmented(r.BaseUrl, r.EffectiveId); + + public static string AnnotationsManifestUrl(this ResolvedRequest r) => + SearchApiRoutes.AnnotationsManifest(r.BaseUrl, r.EffectiveId); + + public static string AnnotationsLinesUrl(this ResolvedRequest r, int i) => + SearchApiRoutes.AnnotationsLines(r.BaseUrl, r.EffectiveId, i); + + public static string AnnotationsWordsUrl(this ResolvedRequest r, int i) => + SearchApiRoutes.AnnotationsWords(r.BaseUrl, r.EffectiveId, i); + + public static string FiguresUrl(this ResolvedRequest r) => SearchApiRoutes.Figures(r.BaseUrl, r.EffectiveId); +} diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs index 595ceaf..9cf9311 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedEndpoints.cs @@ -15,7 +15,7 @@ internal static IEndpointRouteBuilder MapTextAugmentedEndpoints(this IEndpointRo HttpContext ctx) => { var resolved = EndpointHelpers.Resolve(options.Value, ctx, "text-augmented/v3/", id); - var result = await sender.Send(new TextAugmentedRequest(id, resolved.SelfUrl, resolved.BaseUrl, UrlId: resolved.EffectiveId)); + var result = await sender.Send(new TextAugmentedRequest(id, resolved)); if (result == null) return Results.NotFound(); return Results.Json(result, contentType: "application/ld+json"); }); diff --git a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs index ddb4548..4cc4ef0 100644 --- a/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs +++ b/src/TextServices.Search.Api/Features/TextAugmented/TextAugmentedRequest.cs @@ -15,19 +15,15 @@ namespace TextServices.Search.Api.Features.TextAugmented; /// No @context is emitted inside service blocks — it belongs only at document level. /// /// Storage key used to load artefacts from the text store. -/// Absolute URL for this endpoint response (base + route prefix + effective id + query). -/// -/// Scheme + authority only (no path). Used by TextAugmented to build cross-endpoint service URLs +/// +/// Resolved URL components for this request. is used +/// for generated IIIF service URLs; it may differ from when the request arrived +/// via a proxy that rewrites the path (X-Forwarded-Path). /// -/// -/// Id to use when generating IIIF service URLs. Differs from when the -/// request arrived via a proxy that rewrites the path (X-Forwarded-Path). Defaults to -/// when null. -/// -public record TextAugmentedRequest(string Id, string SelfUrl, string SearchBaseUrl, string? UrlId = null) +internal record TextAugmentedRequest(string Id, ResolvedRequest Resolved) : IRequest; -public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache, ILogger logger) +internal class TextAugmentedHandler(ITextStore textStore, ITextCache textCache, ILogger logger) : IRequestHandler { public async Task Handle(TextAugmentedRequest request, CancellationToken ct) @@ -51,24 +47,23 @@ public class TextAugmentedHandler(ITextStore textStore, ITextCache textCache, IL return null; } - var baseUrl = request.SearchBaseUrl; - var id = request.UrlId ?? request.Id; var text = await textCache.GetTextAsync(request.Id, ct); - ReplaceSelfUrl(manifest, request.SelfUrl); + var resolvedRequestUris = request.Resolved; + ReplaceSelfUrl(manifest, resolvedRequestUris.SelfUrl); if (text != null) { - InjectSearchServices(manifest, baseUrl, id); - InjectRenderingLinks(manifest, baseUrl, id, text); - InjectCanvasAnnotationRefs(manifest, baseUrl, id, text); + InjectSearchServices(manifest, resolvedRequestUris); + InjectRenderingLinks(manifest, resolvedRequestUris, text); + InjectCanvasAnnotationRefs(manifest, resolvedRequestUris, text); } else { logger.LogDebug("Text not found: {Id}", request.Id); } - await InjectManifestAnnotationsRefAsync(manifest, baseUrl, id, request.Id); - await InjectFiguresRefAsync(manifest, baseUrl, id, request.Id); + await InjectManifestAnnotationsRefAsync(manifest, resolvedRequestUris, request.Id); + await InjectFiguresRefAsync(manifest, resolvedRequestUris, request.Id); return manifest; } @@ -81,28 +76,28 @@ private static void ReplaceSelfUrl(JsonObject manifest, string selfUrl) manifest["id"] = selfUrl; } - private void InjectSearchServices(JsonObject manifest, string baseUrl, string id) + private void InjectSearchServices(JsonObject manifest, ResolvedRequest resolved) { - logger.LogDebug("Adding search-services: {Id}", id); + logger.LogDebug("Adding search-services: {Id}", resolved.EffectiveId); var searchServiceV2 = new JsonObject { - ["id"] = $"{baseUrl}/search/v2/{id}", // TODO - can we build these from a central place? + ["id"] = resolved.SearchV2Url(), ["type"] = "SearchService2", ["service"] = new JsonArray(new JsonObject { - ["id"] = $"{baseUrl}/autocomplete/v2/{id}", + ["id"] = resolved.AutocompleteV2Url(), ["type"] = "AutoCompleteService2", }), }; var searchServiceV1 = new JsonObject { - ["id"] = $"{baseUrl}/search/v1/{id}", + ["id"] = resolved.SearchV1Url(), ["type"] = "SearchService1", ["profile"] = "http://iiif.io/api/search/1/search", ["service"] = new JsonArray(new JsonObject { - ["id"] = $"{baseUrl}/autocomplete/v1/{id}", + ["id"] = resolved.AutocompleteV1Url(), ["type"] = "AutoCompleteService1", ["profile"] = "http://iiif.io/api/search/1/autocomplete", }), @@ -133,9 +128,9 @@ private void InjectSearchServices(JsonObject manifest, string baseUrl, string id AppendContext(manifest, "http://iiif.io/api/search/1/context.json"); } - private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id, Text text) + private void InjectRenderingLinks(JsonObject manifest, ResolvedRequest resolved, Text text) { - logger.LogDebug("Adding rendering: {Id}", id); + logger.LogDebug("Adding rendering: {Id}", resolved.EffectiveId); // PDF is only added when at least one canvas is image-based (IsTemporalContent == false); // temporal-only manifests (VTT/audio/video) have no page images to render into a PDF. @@ -143,7 +138,7 @@ private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id var textRef = new JsonObject { - ["id"] = $"{baseUrl}/text/v1/{id}", + ["id"] = resolved.FullTextUrl(), ["type"] = "Text", ["label"] = new JsonObject { ["en"] = new JsonArray("View as plain text") }, ["format"] = "text/plain", @@ -152,28 +147,28 @@ private void InjectRenderingLinks(JsonObject manifest, string baseUrl, string id if (manifest["rendering"] is JsonArray existingRendering) { AddIfNew(existingRendering, textRef); - if (hasImageCanvases) AddIfNew(existingRendering, BuildPdfRef(baseUrl, id)); + if (hasImageCanvases) AddIfNew(existingRendering, BuildPdfRef(resolved)); } else if (manifest["rendering"] is JsonObject singleRendering) { var arr = new JsonArray(singleRendering.DeepClone()); AddIfNew(arr, textRef); - if (hasImageCanvases) AddIfNew(arr, BuildPdfRef(baseUrl, id)); + if (hasImageCanvases) AddIfNew(arr, BuildPdfRef(resolved)); manifest["rendering"] = arr; } else { manifest["rendering"] = hasImageCanvases - ? new JsonArray(BuildPdfRef(baseUrl, id), textRef) + ? new JsonArray(BuildPdfRef(resolved), textRef) : new JsonArray(textRef); } } - private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, string id, Text text) + private void InjectCanvasAnnotationRefs(JsonObject manifest, ResolvedRequest resolved, Text text) { if (manifest["items"] is not JsonArray canvases) return; - logger.LogDebug("Adding canvas annotations: {Id}", id); + logger.LogDebug("Adding canvas annotations: {Id}", resolved.EffectiveId); // Build a set of canvas indices that have at least one word, in O(words). var canvasesWithWords = text.Words.Values.Select(w => w.Idx).ToHashSet(); @@ -185,13 +180,13 @@ private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, str var linesRef = new JsonObject { - ["id"] = $"{baseUrl}/annotations/lines/v1/{i}/{id}", + ["id"] = resolved.AnnotationsLinesUrl(i), ["type"] = "AnnotationPage", ["label"] = new JsonObject { ["en"] = new JsonArray("Line-level transcription") }, }; var wordsRef = new JsonObject { - ["id"] = $"{baseUrl}/annotations/words/v1/{i}/{id}", + ["id"] = resolved.AnnotationsWordsUrl(i), ["type"] = "AnnotationPage", ["label"] = new JsonObject { ["en"] = new JsonArray("Word-level transcription") }, }; @@ -215,16 +210,16 @@ private void InjectCanvasAnnotationRefs(JsonObject manifest, string baseUrl, str } } - private async Task InjectManifestAnnotationsRefAsync(JsonObject manifest, string baseUrl, string id, string key) + private async Task InjectManifestAnnotationsRefAsync(JsonObject manifest, ResolvedRequest resolved, string key) { var annotationsJson = await textStore.LoadAnnotations(key); if (annotationsJson == null) return; - logger.LogDebug("Adding Manifest annotations: {Id}", id); + logger.LogDebug("Adding Manifest annotations: {Id}", resolved.EffectiveId); var annotationsRef = new JsonObject { - ["id"] = $"{baseUrl}/annotations/manifest/v1/{id}", + ["id"] = resolved.AnnotationsManifestUrl(), ["type"] = "AnnotationPage", ["profile"] = "https://dlcs.io/profiles/all-text", ["label"] = new JsonObject { ["en"] = new JsonArray("Text of all canvases") }, @@ -246,16 +241,16 @@ private async Task InjectManifestAnnotationsRefAsync(JsonObject manifest, string } } - private async Task InjectFiguresRefAsync(JsonObject manifest, string baseUrl, string id, string key) + private async Task InjectFiguresRefAsync(JsonObject manifest, ResolvedRequest resolved, string key) { var figuresJson = await textStore.LoadFigures(key); if (figuresJson == null) return; - logger.LogDebug("Adding figures: {Id}", id); + logger.LogDebug("Adding figures: {Id}", resolved.EffectiveId); var figuresRef = new JsonObject { - ["id"] = $"{baseUrl}/identified/figures/{id}", + ["id"] = resolved.FiguresUrl(), ["type"] = "AnnotationPage", ["label"] = new JsonObject { @@ -303,9 +298,9 @@ private static void AppendContext(JsonObject manifest, string url) } } - private static JsonObject BuildPdfRef(string baseUrl, string id) => new() + private static JsonObject BuildPdfRef(ResolvedRequest resolved) => new() { - ["id"] = $"{baseUrl}/pdf/v1/{id}", + ["id"] = resolved.PdfUrl(), ["type"] = "Text", ["label"] = new JsonObject { ["en"] = new JsonArray("Download as PDF") }, ["format"] = "application/pdf", diff --git a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs index 88bb26f..ae33940 100644 --- a/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs +++ b/src/TextServices.Tests/SearchApi/CapabilityGatingTests.cs @@ -1,6 +1,7 @@ using Microsoft.Extensions.Logging.Abstractions; using Shouldly; using TextServices.Core.Models; +using TextServices.Search.Api.Features; using TextServices.Search.Api.Features.Annotations; using TextServices.Search.Api.Features.Autocomplete; using TextServices.Search.Api.Features.Figures; @@ -164,7 +165,7 @@ public async Task TextAugmented_WhenTextAugmentedFlagAbsent_ReturnsNull() new NullLogger()); var result = await handler.Handle( - new TextAugmentedRequest(Id, SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest(Id, new ResolvedRequest(Id, SelfUrl, SelfUrl, SearchBase)), CancellationToken.None); result.ShouldBeNull(); } diff --git a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs index 23e23f8..35f3e99 100644 --- a/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs +++ b/src/TextServices.Tests/SearchApi/TextAugmentedHandlerTests.cs @@ -3,6 +3,7 @@ using Shouldly; using TextServices.Core.Models; using TextServices.Core.Providers; +using TextServices.Search.Api.Features; using TextServices.Search.Api.Features.TextAugmented; using TextServices.Search.Api.Services; using TextServices.Storage; @@ -30,7 +31,7 @@ public async Task Handle_ManifestNotFound_ReturnsNull() var handler = MakeHandler(null); var result = await handler.Handle( - new TextAugmentedRequest("missing/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("missing/book", DefaultResolved("missing/book")), CancellationToken.None); result.ShouldBeNull(); } @@ -41,7 +42,7 @@ public async Task Handle_NoText_ReturnsManifestWithoutSearchServicesOrContext() var handler = MakeHandler(V3Manifest(), noText: true); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); result.ShouldNotBeNull(); result["service"].ShouldBeNull(); @@ -58,7 +59,7 @@ public async Task Handle_V3Manifest_ReplacesIdWithSelfUrl() var handler = MakeHandler(V3Manifest("https://original.example.org/manifest/1")); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); result.ShouldNotBeNull(); result!["id"]!.GetValue().ShouldBe(SelfUrl); @@ -70,7 +71,7 @@ public async Task Handle_V2Manifest_ReplacesAtIdWithSelfUrl() var handler = MakeHandler(V2Manifest("https://original.example.org/manifest/1")); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); result.ShouldNotBeNull(); result!["@id"]!.GetValue().ShouldBe(SelfUrl); @@ -86,7 +87,7 @@ public async Task Handle_NoExistingService_CreatesTwoServiceEntries() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(2); // v2 + v1 @@ -98,7 +99,7 @@ public async Task Handle_ExistingServiceArray_PrependsSearchServices() var handler = MakeHandler(V3ManifestWithServiceArray()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(3); // v2 + v1 + original @@ -110,7 +111,7 @@ public async Task Handle_ExistingServiceObject_PromotesToArrayWithSearchFirst() var handler = MakeHandler(V3ManifestWithServiceObject()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(3); // v2 + v1 + original @@ -123,7 +124,7 @@ public async Task Handle_ExistingServiceArray_SearchServicesAreFirst() var handler = MakeHandler(V3ManifestWithServiceArray()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service[0]!["id"]!.GetValue().ShouldBe(ExpectedSearchV2); @@ -137,7 +138,7 @@ public async Task Handle_SearchServiceV2_HasCorrectTypeAndId() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var searchServiceV2 = result!["service"]![0]!; searchServiceV2["id"]!.GetValue().ShouldBe(ExpectedSearchV2); @@ -150,7 +151,7 @@ public async Task Handle_SearchServiceV1_HasCorrectTypeProfileAndId() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var searchServiceV1 = result!["service"]![1]!; searchServiceV1["id"]!.GetValue().ShouldBe(ExpectedSearchV1); @@ -164,7 +165,7 @@ public async Task Handle_AutocompleteServiceV2_NestedInsideSearchServiceV2() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var autocomplete = result!["service"]![0]!["service"]![0]!; autocomplete["id"]!.GetValue().ShouldBe(ExpectedAutocompleteV2); @@ -177,7 +178,7 @@ public async Task Handle_AutocompleteServiceV1_NestedInsideSearchServiceV1() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var autocomplete = result!["service"]![1]!["service"]![0]!; autocomplete["id"]!.GetValue().ShouldBe(ExpectedAutocompleteV1); @@ -195,7 +196,7 @@ public async Task Handle_ImageBasedContent_AddsPdfAndPlainTextRenderingLinks() var handler = MakeHandler(V3Manifest(), cachedText: SpatialText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count.ShouldBe(2); @@ -211,7 +212,7 @@ public async Task Handle_TemporalOnlyContent_AddsPlainTextButNoPdfLink() var handler = MakeHandler(V3Manifest(), cachedText: TemporalText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count.ShouldBe(1); @@ -225,7 +226,7 @@ public async Task Handle_TextNotBuilt_NoRenderingLinks() var handler = MakeHandler(V3Manifest(), noText: true); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); result!["rendering"].ShouldBeNull(); } @@ -236,7 +237,7 @@ public async Task Handle_ImageBasedContent_RenderingLinksArePrependedToExisting( var handler = MakeHandler(V3ManifestWithRendering(), cachedText: SpatialText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count.ShouldBe(3); // PDF + plain text + original @@ -250,7 +251,7 @@ public async Task Handle_TemporalOnlyContent_PlainTextLinkPrependedToExisting() var handler = MakeHandler(V3ManifestWithRendering(), cachedText: TemporalText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count.ShouldBe(2); // plain text + original (no PDF) @@ -272,7 +273,7 @@ public async Task Handle_WithAnnotations_InjectsManifestAnnotationsRef() var handler = MakeHandler(V3Manifest(), annotationsJson: annotationsJson); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var annotations = result!["annotations"].ShouldBeOfType(); var ref0 = annotations[0]!; @@ -291,7 +292,7 @@ public async Task Handle_WithAnnotationsAndFigures_AnnotationsRefBeforeFiguresRe var handler = MakeHandler(V3Manifest(), annotationsJson: annotationsJson, figuresJson: figuresJson); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var annotations = result!["annotations"].ShouldBeOfType(); annotations.Count.ShouldBe(2); @@ -305,7 +306,7 @@ public async Task Handle_WithoutAnnotations_NoManifestAnnotationsRef() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); result!["annotations"].ShouldBeNull(); } @@ -320,7 +321,7 @@ public async Task Handle_ServiceArrayAlreadyContainsSearchV2_DoesNotDuplicateV2( var handler = MakeHandler(V3ManifestWithExistingSearchV2()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(2); // original v2 stays; v1 added; no duplicate v2 @@ -334,7 +335,7 @@ public async Task Handle_ServiceArrayAlreadyContainsBothSearchServices_NeitherDu var handler = MakeHandler(V3ManifestWithBothSearchServices()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(2); // unchanged — both already present @@ -348,7 +349,7 @@ public async Task Handle_ServiceObjectMatchesSearchV2_PromotesToArrayWithoutDupl var handler = MakeHandler(V3ManifestWithSearchV2ServiceObject()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var service = result!["service"].ShouldBeOfType(); service.Count.ShouldBe(2); // v1 added; original v2 kept; no duplicate v2 @@ -366,7 +367,7 @@ public async Task Handle_RenderingArrayAlreadyContainsTextRef_DoesNotDuplicateTe var handler = MakeHandler(V3ManifestWithExistingTextRef(), cachedText: SpatialText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count(n => n?["id"]?.GetValue() == ExpectedRawTextUrl).ShouldBe(1); @@ -378,7 +379,7 @@ public async Task Handle_RenderingArrayAlreadyContainsPdfRef_DoesNotDuplicatePdf var handler = MakeHandler(V3ManifestWithExistingPdfRef(), cachedText: SpatialText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count(n => n?["id"]?.GetValue() == ExpectedPdfUrl).ShouldBe(1); @@ -390,7 +391,7 @@ public async Task Handle_RenderingArrayAlreadyContainsBothRefs_NeitherDuplicated var handler = MakeHandler(V3ManifestWithExistingPdfAndTextRefs(), cachedText: SpatialText()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var rendering = result!["rendering"].ShouldBeOfType(); rendering.Count.ShouldBe(2); // unchanged @@ -409,7 +410,7 @@ public async Task Handle_AnnotationsArrayAlreadyContainsAnnotationsRef_DoesNotDu var handler = MakeHandler(V3ManifestWithExistingAnnotationsRef(), annotationsJson: annotationsJson); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var annotations = result!["annotations"].ShouldBeOfType(); annotations.Count(n => n?["id"]?.GetValue() == @@ -423,7 +424,7 @@ public async Task Handle_AnnotationsArrayAlreadyContainsFiguresRef_DoesNotDuplic var handler = MakeHandler(V3ManifestWithExistingFiguresRef(), figuresJson: figuresJson); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var annotations = result!["annotations"].ShouldBeOfType(); annotations.Count(n => n?["id"]?.GetValue() == @@ -440,7 +441,7 @@ public async Task Handle_NoContext_AddsSearchContexts() var handler = MakeHandler("""{"id":"https://example.org/m/1","type":"Manifest"}"""); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var ctx = result!["@context"].ShouldBeOfType(); ctx.Select(n => n!.GetValue()).ShouldBe( @@ -456,7 +457,7 @@ public async Task Handle_StringContext_PromotesToArrayAndAppendsSearchContexts() var handler = MakeHandler(V3Manifest()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var ctx = result!["@context"].ShouldBeOfType(); ctx.Select(n => n!.GetValue()).ShouldBe( @@ -473,7 +474,7 @@ public async Task Handle_ArrayContext_AppendsSearchContexts() var handler = MakeHandler("""{"id":"https://example.org/m/1","type":"Manifest","@context":["http://iiif.io/api/presentation/3/context.json"]}"""); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var ctx = result!["@context"].ShouldBeOfType(); ctx.Select(n => n!.GetValue()).ShouldBe( @@ -490,13 +491,16 @@ public async Task Handle_ContextAlreadyContainsSearchUrls_DoesNotDuplicate() var handler = MakeHandler(V3ManifestWithSearchContexts()); var result = await handler.Handle( - new TextAugmentedRequest("test/book", SelfUrl, SearchBase), CancellationToken.None); + new TextAugmentedRequest("test/book", DefaultResolved()), CancellationToken.None); var ctx = result!["@context"].ShouldBeOfType(); ctx.Count(n => n!.GetValue() == "http://iiif.io/api/search/2/context.json").ShouldBe(1); ctx.Count(n => n!.GetValue() == "http://iiif.io/api/search/1/context.json").ShouldBe(1); } + private static ResolvedRequest DefaultResolved(string id = "test/book") + => new(id, SelfUrl, SelfUrl, SearchBase); + private static TextAugmentedHandler MakeHandler( string? manifestJson, string? annotationsJson = null,