Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
b929458
Deduplicate and refactor TextAugmentedHandler
donaldgray Jun 1, 2026
6d7ee1e
Add logging to TextAugmentedHandler
donaldgray Jun 1, 2026
b174c43
Add logging + IDE formatting
donaldgray Jun 1, 2026
fbd1b7e
Add structured logging to storage and text format providers
donaldgray Jun 1, 2026
af7e027
Validate job ID must not start with '/'
donaldgray Jun 1, 2026
f39bb6a
Add example appsettings
donaldgray Jun 1, 2026
79ca240
Exclude dev appsettings from gitignore
donaldgray Jun 1, 2026
90354f4
Add S3 support to Search API, align storage config structure
donaldgray Jun 1, 2026
f4acca9
Centralise user-agent strings and add AWS startup logging
donaldgray Jun 1, 2026
cd27bad
Fixup e2e tests to use appsettings.test.json
donaldgray Jun 1, 2026
a8f6edb
Move test projects to sln folder
donaldgray Jun 1, 2026
c5bc012
Add AWSSDK.Security token nuget package
donaldgray Jun 1, 2026
def6a36
Remove unnecessary logging
donaldgray Jun 1, 2026
88753f7
Make catch-all route ID parameters required
donaldgray Jun 1, 2026
aff80df
Inject IIIF Search @context and gate services on text existence
donaldgray Jun 1, 2026
205e443
Improve hangfire logging
donaldgray Jun 1, 2026
37192fd
Rename file to match type
donaldgray Jun 2, 2026
a63dc00
Downgrade mediatr
donaldgray Jun 2, 2026
a88ee57
Extra logging, make CancellationTokens required
donaldgray Jun 2, 2026
0358157
Remove TODOs (issues created)
donaldgray Jun 2, 2026
414a624
Centralise Search API URL construction via SearchApiRoutes
donaldgray Jun 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ csharp_space_after_cast = false
csharp_prefer_braces = when_multiline:warning
csharp_style_expression_bodied_methods = false:suggestion
csharp_style_expression_bodied_properties = true:suggestion
csharp_method_or_operator_body = expression_body:suggestion
dotnet_style_qualification_for_field = false:warning
dotnet_style_qualification_for_property = false:warning

Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -436,3 +436,6 @@ FodyWeavers.xsd
*.msix
*.msm
*.msp

appsettings.Development.json
appsettings.Development*
24 changes: 17 additions & 7 deletions docs/builder-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -385,14 +385,21 @@ Builder API configuration lives under the `TextServices` key in `appsettings.jso

```json
{
"RunMigrations": true,
"ConnectionStrings": {
"BuilderDb": "Host=localhost;Database=textservices;Username=...;Password=..."
},
"TextServices": {
"SearchApiBaseUrl": "https://search.example.org",
"AllowFileImageProxy": false,
"MaxConcurrentPageFetches": 8,
"Storage": {
"RootPath": "/data/textservices"
"FileSystem": {
"RootPath": "/data/textservices"
}
},
"Notifications": {
"TopicArn": ""
}
},
"CorsAllowedOrigins": ["https://viewer.example.org"]
Expand All @@ -401,13 +408,16 @@ Builder API configuration lives under the `TextServices` key in `appsettings.jso

| Setting | Default | Description |
|---|---|---|
| `RunMigrations` | `false` | When `true`, applies any pending EF Core database migrations on startup. Convenient for dev and simple deployments; leave `false` and run migrations separately in production. |
| `ConnectionStrings:BuilderDb` | _(required)_ | PostgreSQL connection string. Used for both EF Core (job state) and Hangfire (job queue). |
| `SearchApiBaseUrl` | `""` | Public base URL of the Search API. Used to populate the `searchV1` / `searchV2` fields in completed job responses, and to construct `/proxy/image` URLs in synthesised Manifests when `sourceData` pages supply `file://` or `s3://` imageUri values. Leave empty if the Search API is not yet deployed. |
| `MaxConcurrentPageFetches` | `8` | Maximum number of text files fetched in parallel within a single job. Increase for internal or S3 sources; keep low (4–8) for third-party HTTP hosts. |
| `ReportBatchProgress` | `true` | When `true`, `PagesCompleted` is flushed to the database every 10 pages during processing, allowing `GET /textbuilder/{id}` to reflect live progress. Set to `false` to reduce database writes on large manifests; the final count is still persisted when the job completes. |
| `Storage:RootPath` | `textservices-data` | Root directory for stored text artefacts. Ignored when S3 storage is configured. Must be readable by the Search API if used. |
| `Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. |
| `Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. |
| `CorsAllowedOrigins` | `[]` | Allowed CORS origins. Empty array disables CORS. |
| `TextServices:SearchApiBaseUrl` | `""` | Public base URL of the Search API. Used to populate the `searchV1` / `searchV2` fields in completed job responses, and to construct `/proxy/image` URLs in synthesised Manifests when `sourceData` pages supply `file://` or `s3://` imageUri values. Leave empty if the Search API is not yet deployed. |
| `TextServices:AllowFileImageProxy` | `false` | When `true`, synthesised Manifests embed `/proxy/image` proxy URLs for `file://` and `s3://` imageUri values instead of omitting the painting annotation. Only enable in trusted local-dev environments. The Search API's `AllowFileImageProxy` must also be `true` for those proxy URLs to serve real content. |
| `TextServices:MaxConcurrentPageFetches` | `8` | Maximum number of text files fetched in parallel within a single job. Increase for internal or S3 sources; keep low (4–8) for third-party HTTP hosts. |
| `TextServices:ReportBatchProgress` | `true` | When `true`, `PagesCompleted` is flushed to the database every 10 pages during processing, allowing `GET /textbuilder/{id}` to reflect live progress. Set to `false` to reduce database writes on large manifests; the final count is still persisted when the job completes. |
| `TextServices:Storage:FileSystem:RootPath` | `textservices-data` | Root directory for stored text artefacts. Ignored when S3 storage is configured. Must be readable by the Search API if used. |
| `TextServices:Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. |
| `TextServices:Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. |
| `TextServices:Notifications:TopicArn` | `""` | ARN of an SNS topic to publish a notification to when a job completes (success or failure). Leave empty to disable notifications. |

S3 credentials and region are resolved by the standard AWS SDK credential chain (environment variables, instance profile, `appsettings.json` `AWS` section). Configure the region via the `AWS:Region` key or the `AWS_DEFAULT_REGION` environment variable.
26 changes: 16 additions & 10 deletions docs/search-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,11 @@ Search API configuration lives under the `TextServices` key in `appsettings.json
"CacheSlidingExpirationMinutes": 30,
"CacheAbsoluteExpirationHours": 4,
"CacheMaxEntries": 20,
"StorageRootPath": "/data/textservices",
"Storage": {
"FileSystem": {
"RootPath": "/data/textservices"
}
},
"PdfTriggerQueueCapacity": 50,
"PdfTriggerMaxConcurrency": 2
}
Expand All @@ -510,15 +514,17 @@ Search API configuration lives under the `TextServices` key in `appsettings.json

| Setting | Default | Description |
|---|---|---|
| `BaseUrl` | `""` | Public base URL of this API. Required when running behind a reverse proxy; without it, self-referencing URLs in responses will use the incoming `Host` header, which may be internal. |
| `CacheSlidingExpirationMinutes` | `30` | How long a text object stays in the memory cache after its last access. |
| `CacheAbsoluteExpirationHours` | `4` | Hard upper limit on cache lifetime, regardless of access frequency. Prevents large objects from living in the LOH indefinitely. |
| `CacheMaxEntries` | `20` | Maximum number of Text (and AutoComplete) objects held in memory simultaneously. Each object counts as one slot; LRU eviction applies when the limit is reached. Budget approximately 30–40 MB per large text when sizing container memory. |
| `StorageRootPath` | `textservices-data` | Root directory of the text artefact store. Must point to the same location as the Builder API's `Storage:RootPath`. |
| `PdfTriggerQueueCapacity` | `50` | Maximum number of PDF trigger requests that can be queued for background generation. Requests beyond this limit receive `503 Service Unavailable`. |
| `PdfTriggerMaxConcurrency` | `2` | Maximum number of PDFs generated concurrently by the background trigger queue. Each in-flight generation buffers the full PDF in memory — keep this low on memory-constrained hosts. |
| `AllowFileImageProxy` | `false` | When `true`, the `/proxy/image` endpoint streams local `file://` images. Only enable in trusted local-dev environments where those files are not access-controlled. |
| `AllowedCustomHosts` | `[]` | Hostnames accepted from the `X-Forwarded-Host` request header (e.g. custom CloudFront distributions). See [Forwarded-header URL rewriting](#forwarded-header-url-rewriting) below. |
| `TextServices:BaseUrl` | `""` | Public base URL of this API. Required when running behind a reverse proxy; without it, self-referencing URLs in responses will use the incoming `Host` header, which may be internal. |
| `TextServices:CacheSlidingExpirationMinutes` | `30` | How long a text object stays in the memory cache after its last access. |
| `TextServices:CacheAbsoluteExpirationHours` | `4` | Hard upper limit on cache lifetime, regardless of access frequency. Prevents large objects from living in the LOH indefinitely. |
| `TextServices:CacheMaxEntries` | `20` | Maximum number of Text (and AutoComplete) objects held in memory simultaneously. Each object counts as one slot; LRU eviction applies when the limit is reached. Budget approximately 30–40 MB per large text when sizing container memory. |
| `TextServices:Storage:FileSystem:RootPath` | `textservices-data` | Root directory of the text artefact store. Must match the Builder API's `TextServices:Storage:FileSystem:RootPath`. Ignored when S3 storage is configured. |
| `TextServices:Storage:S3:BucketName` | `""` | S3 bucket for stored artefacts. When set, the S3 store is used instead of the filesystem store. Must match the Builder API's `TextServices:Storage:S3:BucketName`. |
| `TextServices:Storage:S3:KeyPrefix` | `""` | Optional prefix for all S3 object keys (e.g. `"textservices/"`). A trailing `/` is added automatically if omitted. Must match the Builder API's `TextServices:Storage:S3:KeyPrefix`. |
| `TextServices:PdfTriggerQueueCapacity` | `50` | Maximum number of PDF trigger requests that can be queued for background generation. Requests beyond this limit receive `503 Service Unavailable`. |
| `TextServices:PdfTriggerMaxConcurrency` | `2` | Maximum number of PDFs generated concurrently by the background trigger queue. Each in-flight generation buffers the full PDF in memory — keep this low on memory-constrained hosts. |
| `TextServices:AllowFileImageProxy` | `false` | When `true`, the `/proxy/image` endpoint streams local `file://` images. Only enable in trusted local-dev environments where those files are not access-controlled. |
| `TextServices:AllowedCustomHosts` | `[]` | Hostnames accepted from the `X-Forwarded-Host` request header (e.g. custom CloudFront distributions). See [Forwarded-header URL rewriting](#forwarded-header-url-rewriting) below. |

---

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
using Hangfire;
using Hangfire.PostgreSql;
using Microsoft.Extensions.Options;
using Serilog;
using Serilog.Extensions.Logging;
using TextServices.Builder.Api.Services;
using TextServices.Builder.Api.Services.Notifications;
using TextServices.Infrastructure.Http;
using TextServices.Storage;

namespace TextServices.Builder.Api.Configuration;
Expand Down Expand Up @@ -53,7 +56,7 @@ public static IServiceCollection AddFetchingServices(this IServiceCollection ser
{
services.AddHttpClient("Resource", client =>
{
client.DefaultRequestHeaders.UserAgent.ParseAdd("TextServices/1.0");
client.DefaultRequestHeaders.UserAgent.ParseAdd(UserAgents.Builder);
client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));
client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/ld+json", 0.9));
client.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("*/*", 0.8));
Expand Down Expand Up @@ -81,7 +84,9 @@ public static IServiceCollection AddTextStorage(this IServiceCollection services
if (!string.IsNullOrEmpty(configuration["TextServices:Storage:S3:BucketName"]))
return ActivatorUtilities.CreateInstance<S3TextStore>(sp);
var storage = sp.GetRequiredService<IOptions<TextServicesOptions>>().Value.Storage;
return new FileSystemTextStore(new FileSystemTextStoreOptions { RootPath = storage.RootPath });
return ActivatorUtilities.CreateInstance<FileSystemTextStore>(
sp,
new FileSystemTextStoreOptions { RootPath = storage.FileSystem.RootPath });
});

return services;
Expand Down
29 changes: 19 additions & 10 deletions src/TextServices.Builder.Api/Configuration/TextServicesOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ public class TextServicesOptions
/// </summary>
public string SearchApiBaseUrl { get; set; } = string.Empty;


/// <summary>
/// Allow the Search API's <c>/proxy/image</c> endpoint to serve <c>file://</c> image URIs
/// supplied in <c>sourceData</c> pages.
Expand All @@ -26,18 +25,10 @@ public class TextServicesOptions
/// referenced by <c>imageUri</c> are not access-controlled.
/// </para>
/// </summary>
public bool AllowFileImageProxy { get; set; } = false;
public bool AllowFileImageProxy { get; set; }

/// <summary>
/// Maximum number of text files (ALTO, VTT, AnnotationPage) fetched concurrently within a single job.
/// The right value depends on the source:
/// <list type="bullet">
/// <item>Third-party HTTP (Wellcome, Internet Archive, etc.): 4–8 for politeness.</item>
/// <item>Internal/trusted HTTP: 16–32.</item>
/// <item>S3 (same-region, same-bucket): 64–128 — S3 handles high parallelism well.</item>
/// </list>
/// TODO: When S3 storage is added, consider deriving the limit automatically from
/// the URI scheme/host of the text links, or adding a per-host override table here.
/// </summary>
public int MaxConcurrentPageFetches { get; set; } = 8;

Expand All @@ -57,11 +48,29 @@ public class TextServicesOptions
}

/// <summary>
/// Groups the settings for the available text-artefact storage back ends.
/// </summary>
public class StorageOptions
{
    /// <summary>
    /// Settings for the filesystem-backed text store.
    /// </summary>
    public FileSystemStorageOptions FileSystem { get; set; } = new FileSystemStorageOptions();

    /// <summary>
    /// Settings for S3 storage. S3 replaces the filesystem store whenever
    /// <see cref="S3StorageOptions.BucketName"/> is set.
    /// </summary>
    public S3StorageOptions S3 { get; set; } = new S3StorageOptions();
}

/// <summary>
/// Settings for the filesystem text store.
/// </summary>
public class FileSystemStorageOptions
{
    /// <summary>
    /// Directory beneath which text artefacts are written on the filesystem.
    /// Defaults to <c>textservices-data</c>.
    /// </summary>
    public string RootPath { get; set; } = "textservices-data";
}

/// <summary>
/// Settings for the S3 text store.
/// </summary>
public class S3StorageOptions
{
    /// <summary>
    /// Name of the S3 bucket that holds stored artefacts. Empty by default.
    /// </summary>
    public string BucketName { get; set; } = string.Empty;

    /// <summary>
    /// Optional prefix prepended to every S3 object key (e.g. "textservices/").
    /// A trailing / is added automatically if omitted.
    /// </summary>
    public string KeyPrefix { get; set; } = string.Empty;
}

public class NotificationsOptions
{
/// <summary>
Expand Down
3 changes: 1 addition & 2 deletions src/TextServices.Builder.Api/Features/Jobs/DeleteJob.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ public async Task<bool> Handle(DeleteJobRequest request, CancellationToken ct)
var job = await db.Jobs.FindAsync([request.Id], ct);
if (job == null) return false;

if (job.HangfireJobId != null)
hangfire.Delete(job.HangfireJobId);
if (job.HangfireJobId != null) hangfire.Delete(job.HangfireJobId);

await textStore.DeleteArtefacts(job.Id);

Expand Down
6 changes: 3 additions & 3 deletions src/TextServices.Builder.Api/Features/Jobs/JobEndpoints.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,13 @@ internal static IEndpointRouteBuilder MapJobEndpoints(this IEndpointRouteBuilder
return Results.Ok(result);
});

routes.MapGet("/textbuilder/{**id}", async (string id, ISender sender) =>
routes.MapGet("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) =>
{
var response = await sender.Send(new GetJobRequest(id));
return response == null ? Results.NotFound() : Results.Ok(response);
});

routes.MapPut("/textbuilder/{**id}", async (string id, ISender sender) =>
routes.MapPut("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) =>
{
var result = await sender.Send(new ReprocessJobRequest(id));
return result.Status switch
Expand All @@ -52,7 +52,7 @@ internal static IEndpointRouteBuilder MapJobEndpoints(this IEndpointRouteBuilder
};
});

routes.MapDelete("/textbuilder/{**id}", async (string id, ISender sender) =>
routes.MapDelete("/textbuilder/{*id:minlength(1)}", async (string id, ISender sender) =>
{
var found = await sender.Send(new DeleteJobRequest(id));
return found ? Results.NoContent() : Results.NotFound();
Expand Down
11 changes: 9 additions & 2 deletions src/TextServices.Builder.Api/Features/Jobs/JobInstruction.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ public class JobInstruction : IValidatableObject

public IEnumerable<ValidationResult> Validate(ValidationContext validationContext)
{
if (Id.StartsWith('/'))
{
yield return new ValidationResult(
"Job ID must not start with '/'.",
[nameof(Id)]);
}

var hasUri = !string.IsNullOrWhiteSpace(SourceUri);
var hasData = SourceData is { Count: > 0 };

Expand All @@ -63,9 +70,9 @@ public IEnumerable<ValidationResult> Validate(ValidationContext validationContex
[nameof(SourceUri), nameof(SourceData)]);
}

if (SourceData != null)
if (hasData)
{
foreach (var page in SourceData)
foreach (var page in SourceData!)
{
if (!string.Equals(page.Type, "pdf", StringComparison.OrdinalIgnoreCase)
&& string.IsNullOrEmpty(page.Id))
Expand Down
19 changes: 10 additions & 9 deletions src/TextServices.Builder.Api/Features/Jobs/JobResponse.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using TextServices.Builder.Api.Configuration;
using TextServices.Builder.Api.Data;
using TextServices.Infrastructure;
using TextServices.Storage;

namespace TextServices.Builder.Api.Features.Jobs;
Expand Down Expand Up @@ -76,30 +77,30 @@ public static JobResponse From(BuilderJob job, TextServicesOptions options)

if (fulfilled.HasFlag(JobServices.Search))
{
searchV1 = $"{baseUrl}/search/v1/{job.Id}";
searchV2 = $"{baseUrl}/search/v2/{job.Id}";
searchV1 = SearchApiRoutes.SearchV1(baseUrl, job.Id);
searchV2 = SearchApiRoutes.SearchV2(baseUrl, job.Id);
}

if (fulfilled.HasFlag(JobServices.Autocomplete))
{
autocompleteV1 = $"{baseUrl}/autocomplete/v1/{job.Id}";
autocompleteV2 = $"{baseUrl}/autocomplete/v2/{job.Id}";
autocompleteV1 = SearchApiRoutes.AutocompleteV1(baseUrl, job.Id);
autocompleteV2 = SearchApiRoutes.AutocompleteV2(baseUrl, job.Id);
}

if (fulfilled.HasFlag(JobServices.FullText))
fullText = $"{baseUrl}/text/v1/{job.Id}";
fullText = SearchApiRoutes.FullText(baseUrl, job.Id);

if (fulfilled.HasFlag(JobServices.Pdf))
pdf = $"{baseUrl}/pdf/v1/{job.Id}";
pdf = SearchApiRoutes.Pdf(baseUrl, job.Id);

if (fulfilled.HasFlag(JobServices.TextAugmented))
textAugmented = $"{baseUrl}/text-augmented/v3/{job.Id}";
textAugmented = SearchApiRoutes.TextAugmented(baseUrl, job.Id);

if (fulfilled.HasFlag(JobServices.Annotations))
annotations = $"{baseUrl}/annotations/manifest/v1/{job.Id}";
annotations = SearchApiRoutes.AnnotationsManifest(baseUrl, job.Id);

if (fulfilled.HasFlag(JobServices.Figures))
figures = $"{baseUrl}/identified/figures/{job.Id}";
figures = SearchApiRoutes.Figures(baseUrl, job.Id);
}

return new JobResponse
Expand Down
14 changes: 14 additions & 0 deletions src/TextServices.Builder.Api/Jobs/LogContextHelpers.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using Serilog.Context;
using Serilog.Core.Enrichers;

namespace TextServices.Builder.Api.Jobs;

internal static class LogContextHelpers
{
    /// <summary>
    /// Pushes a "CorrelationId" property onto the Serilog <see cref="LogContext"/> so that
    /// log events written while it is active can be filtered by it.
    /// The value has the form <c>{jobId}:{random-guid}</c>.
    /// </summary>
    /// <param name="jobId">Job identifier used as the first part of the correlation id.</param>
    /// <returns>The handle returned by <see cref="LogContext.Push(Serilog.Core.ILogEventEnricher)"/>; dispose it to end the scope.</returns>
    public static IDisposable SetCorrelationId(string jobId)
    {
        // A fresh GUID per call distinguishes separate runs of the same job.
        var correlationId = $"{jobId}:{Guid.NewGuid()}";
        var enricher = new PropertyEnricher("CorrelationId", correlationId);
        return LogContext.Push(enricher);
    }
}
Loading
Loading