From 0f2b56c7415d44aeebb7d5c9f55df051d16585f6 Mon Sep 17 00:00:00 2001
From: ta4tsering
Date: Wed, 29 Oct 2025 16:23:46 +0530
Subject: [PATCH] feat: added get_span_text

---
 src/openpecha/pecha/__init__.py   | 29 +++++++++++++++++++++++++++++
 tests/pecha/test_get_span_text.py | 21 +++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 tests/pecha/test_get_span_text.py

diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py
index 16acde74..6163cee5 100644
--- a/src/openpecha/pecha/__init__.py
+++ b/src/openpecha/pecha/__init__.py
@@ -1,9 +1,10 @@
 import json
 import shutil
 from pathlib import Path
 from typing import Dict, List, Optional
 
 from stam import AnnotationStore, Offset, Selector
 
+from openpecha.pecha.annotations import span
 from openpecha.exceptions import StamAddAnnotationError, FileNotFoundError, MetaDataValidationError
 from openpecha.ids import (
@@ -252,6 +253,34 @@ def update_annotation(self, annotation_id: str, annotation:List[BaseAnnotation],
         self.add(annotation_id, annotation)
         return self
 
+    def get_base_text(self) -> str:
+        """Return the content of this pecha's base text file.
+
+        Looks for ``*.txt`` files in ``<pecha_path>/base``. If several exist,
+        the first one in sorted filename order is used so that the result does
+        not depend on filesystem listing order.
+
+        Raises:
+            FileNotFoundError: If the base directory holds no ``.txt`` file.
+        """
+        base_dir = self.pecha_path / "base"
+        base_files = sorted(base_dir.glob("*.txt"))
+        if not base_files:
+            raise FileNotFoundError(f"No base text file found in {base_dir}")
+        return base_files[0].read_text(encoding="utf-8")
+
+    def get_span_text(self, span: Optional[span] = None) -> str:
+        """Return the slice of the base text covered by ``span``.
+
+        Args:
+            span: Character offsets (``start`` inclusive, ``end`` exclusive)
+                into the base text. When ``None``, the whole text is returned.
+        """
+        base_text = self.get_base_text()
+        if span is None:
+            return base_text
+        return base_text[span.start:span.end]
+
 
 def get_anns(ann_store: AnnotationStore, include_span: bool = False):
     anns = []
diff --git a/tests/pecha/test_get_span_text.py b/tests/pecha/test_get_span_text.py
new file mode 100644
index 00000000..307a37e0
--- /dev/null
+++ b/tests/pecha/test_get_span_text.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+from openpecha.pecha import Pecha
+from openpecha.pecha.annotations import span
+
+
+def test_get_span_text():
+    pecha = Pecha.from_path(Path("tests/pecha/data/I5003D420"))
+    text = pecha.get_span_text(span(start=0, end=12))
+    assert text == "In Sanskrit:"
+
+
+def test_get_span_text_without_span():
+    pecha = Pecha.from_path(Path("tests/pecha/data/I5003D420"))
+    text = pecha.get_span_text()
+    assert text == pecha.get_base_text()
+
+
+if __name__ == "__main__":
+    test_get_span_text()
+    test_get_span_text_without_span()