diff --git a/src/openpecha/buda/__init__.py b/src/openpecha/buda/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/openpecha/buda/api.py b/src/openpecha/buda/api.py deleted file mode 100644 index 65e2b7ca..00000000 --- a/src/openpecha/buda/api.py +++ /dev/null @@ -1,214 +0,0 @@ -import codecs -import csv -import gzip -import hashlib -import io -import json -import logging -from contextlib import closing - -import boto3 -import botocore -import pyewts -import rdflib -import requests -from rdflib import Literal -from rdflib.namespace import SKOS, Namespace - -LDSPDIBASEURL = "https://ldspdi.bdrc.io/" -CONVERTER = pyewts.pyewts() - -SESSION = boto3.Session() -S3 = SESSION.client("s3") - -BDR = Namespace("http://purl.bdrc.io/resource/") -BDO = Namespace("http://purl.bdrc.io/ontology/core/") -BDA = Namespace("http://purl.bdrc.io/admindata/") -ADM = Namespace("http://purl.bdrc.io/ontology/admin/") - - -def fetch_op_commits(ldspdibaseurl="http://ldspdi.bdrc.io/"): - """ - Fetches the list of all openpecha commits on BUDA - """ - res = {} - headers = {"Accept": "text/csv"} - params = {"format": "csv"} - with closing( - requests.get( - ldspdibaseurl + "/query/table/OP_allCommits", - stream=True, - headers=headers, - params=params, - ) - ) as r: - reader = csv.reader(codecs.iterdecode(r.iter_lines(), "utf-8")) - for row in reader: - if not row[0].startswith("http://purl.bdrc.io/resource/IE0OP"): - logging.error("cannot interpret csv line starting with " + row[0]) - continue - res[row[0][34:]] = row[1] - return res - - -def get_s3_folder_prefix(wlname, image_group_lname): - """ - gives the s3 prefix (~folder) in which the volume will be present. - inpire from https://github.com/buda-base/buda-iiif-presentation/blob/master/src/main/java/ - io/bdrc/iiif/presentation/ImageInfoListService.java#L73 - Example: - - wlname=W22084, image_group_lname=I0886 - - result = "Works/60/W22084/images/W22084-0886/ - where: - - 60 is the first two characters of the md5 of the string W22084 - - 0886 is: - * the image group ID without the initial "I" if the image group ID is in the form I\\d\\d\\d\\d - * or else the full image group ID (incuding the "I") - """ - md5 = hashlib.md5(str.encode(wlname)) - two = md5.hexdigest()[:2] - - pre, rest = image_group_lname[0], image_group_lname[1:] - if pre == "I" and rest.isdigit() and len(rest) == 4: - suffix = rest - else: - suffix = image_group_lname - - return "Works/{two}/{RID}/images/{RID}-{suffix}/".format( - two=two, RID=wlname, suffix=suffix - ) - - -def gets3blob(s3Key): - f = io.BytesIO() - try: - S3.download_fileobj("archive.tbrc.org", s3Key, f) - return f - except botocore.exceptions.ClientError as e: - if e.response["Error"]["Code"] == "404": - return None - else: - raise - - -def get_image_list_s3(wlname, image_group_lname): - s3key = get_s3_folder_prefix(wlname, image_group_lname) + "dimensions.json" - blob = gets3blob(s3key) - if blob is None: - return None - blob.seek(0) - b = blob.read() - ub = gzip.decompress(b) - s = ub.decode("utf8") - data = json.loads(s) - return data - - -def get_image_list_iiifpres(wlname, image_group_lname): - r = requests.get(f"http://iiifpres.bdrc.io/il/v:bdr:{wlname}") - return r.json() - - -def get_image_list(wlname, image_group_lname, source="s3", reorder_with_bvm=False): - il = None - if source == "s3": - il = get_image_list_s3(wlname, image_group_lname) - else: - il = get_image_list_iiifpres(wlname, image_group_lname) - return il - - -def _res_from_model(g, wlname): - res = { - "source_metadata": {"id": "http://purl.bdrc.io/resource/" + wlname}, - "image_groups": {}, - } - wres = BDR[wlname] - try: - adm = g.value(predicate=ADM.adminAbout, object=wres) - res["source_metadata"]["status"] = str(g.value(adm, ADM.status)) - res["source_metadata"]["access"] = str(g.value(adm, ADM.access)) - if (adm, ADM.restrictedInChina, Literal(True)) in g: - res["source_metadata"]["geo_restriction"] = ["CN"] - mwres = g.value(wres, BDO.instanceReproductionOf) - res["source_metadata"]["reproduction_of"] = str(mwres) - for _, _, cs in g.triples((mwres, BDO.copyright, None)): - res["source_metadata"]["copyright_status"] = str(cs) - if "copyright_status" not in res["source_metadata"]: - res["source_metadata"][ - "copyright_status" - ] = "http://purl.bdrc.io/resource/CopyrightPublicDomain" - res["source_metadata"]["reproduction_of"] = str(mwres) - for _, _, l in g.triples((mwres, SKOS.prefLabel, None)): - if l.language == "bo-x-ewts": - res["source_metadata"]["title"] = CONVERTER.toUnicode(l.value) - break - else: - res["source_metadata"]["title"] = l.value - res["source_metadata"]["languages"] = set() - for _, _, wa in g.triples((mwres, BDO.instanceOf, None)): - for _, _, l in g.triples((wa, BDO.language, None)): - for _, _, lt in g.triples((l, BDO.langBCP47Lang, None)): - res["source_metadata"]["languages"].add(lt.value) - for _, _, aac in g.triples((wa, BDO.creator, None)): - if (aac, BDO.role, BDR.R0ER0009) or (aac, BDO.role, BDR.R0ER0009) in g: - for _, _, p in g.triples((aac, BDO.agent, None)): - for _, _, l in g.triples((p, SKOS.prefLabel, None)): - if l.language == "bo-x-ewts": - res["source_metadata"]["author"] = CONVERTER.toUnicode( - l.value - ) - break - else: - res["source_metadata"]["author"] = l.value - res["source_metadata"]["languages"] = list(res["source_metadata"]["languages"]) - for _, _, ig in g.triples((wres, BDO.instanceHasVolume, None)): - if ( - g.value(ig, BDO.volumeNumber) is None - or g.value(ig, BDO.volumePagesTotal) is None - ): - continue - iglname = str(ig)[str(ig).rfind("/") + 1 :] - res["image_groups"][iglname] = {} - iginfo = res["image_groups"][iglname] - iginfo["id"] = str(ig) - iginfo["total_pages"] = int(g.value(ig, BDO.volumePagesTotal)) - iginfo["volume_number"] = int(g.value(ig, BDO.volumeNumber)) - iginfo["volume_pages_bdrc_intro"] = int( - g.value(ig, BDO.volumePagesTbrcIntro) - ) - for _, _, l in g.triples((ig, SKOS.prefLabel, None)): - if l.language == "bo-x-ewts": - iginfo["title"] = CONVERTER.toUnicode(l.value) - break - else: - iginfo["title"] = l.value - finally: - return res - - -def get_buda_scan_info(wlname): - headers = {"Accept": "text/turtle"} - params = {"R_RES": "bdr:" + wlname} - res = None - g = rdflib.Graph() - try: - req = requests.get( - LDSPDIBASEURL + "query/graph/OP_info", - headers=headers, - params=params, - ) - g.parse(data=req.text, format="ttl") - res = _res_from_model(g, wlname) - except Exception as e: - logging.error("get_buda_scan_info failed for " + wlname + ": " + str(e)) - finally: - return res - - -def image_group_to_folder_name(scan_id, image_group_id): - image_group_folder_part = image_group_id - pre, rest = image_group_id[0], image_group_id[1:] - if pre == "I" and rest.isdigit() and len(rest) == 4: - image_group_folder_part = rest - return scan_id + "-" + image_group_folder_part diff --git a/src/openpecha/pecha/__init__.py b/src/openpecha/pecha/__init__.py index 834ccadc..16acde74 100644 --- a/src/openpecha/pecha/__init__.py +++ b/src/openpecha/pecha/__init__.py @@ -14,7 +14,6 @@ ) from openpecha.pecha.annotations import BaseAnnotation from openpecha.pecha.layer import AnnotationType -from openpecha.pecha.metadata import PechaMetaData from openpecha.config import PECHAS_PATH BASE_NAME = str @@ -25,9 +24,8 @@ class Pecha: def __init__(self, pecha_id: str, pecha_path: Path) -> None: self.id = pecha_id self.pecha_path = pecha_path - self.metadata = self.load_metadata() self.bases = self.load_bases() - # self.annotations = self.load_annotations() + self.annotations = [] @classmethod def from_path(cls, pecha_path: Path) -> "Pecha": @@ -65,21 +63,28 @@ def create(cls, output_path: Optional[Path] = None, pecha_id: Optional[str] = No return cls(pecha_id, pecha_path) @classmethod - def create_pecha(cls, pecha_id: str, base_text: str, annotation_id: str, annotation: List[BaseAnnotation]) -> "Pecha": + def create_pecha(cls, pecha_id: str, base_text: str, annotation_id: str, annotation: List[BaseAnnotation], annotation_type: AnnotationType) -> "Pecha": pecha = cls.create(pecha_id=pecha_id) base_name = pecha.set_base(base_text) - ann_type = get_annotation_type(annotation) - ann_store, _ = pecha.add_layer(base_name=base_name, layer_type=ann_type, annotation_id=annotation_id) - + ann_store, _ = pecha.add_layer(base_name=base_name, layer_type=annotation_type, annotation_id=annotation_id) for single_annotation in annotation: - ann_store = pecha.add_annotation(ann_store=ann_store, annotation=single_annotation, layer_type=ann_type) + ann_store = pecha.add_annotation(ann_store=ann_store, annotation=single_annotation, layer_type=annotation_type) ann_store.save() + annotations = get_anns(ann_store, include_span=True) + for annotation in annotations: + pecha.annotations.append({ + "span": { + "start": annotation["span"]["start"], + "end": annotation["span"]["end"], + }, + "id": annotation["id"] + }) return pecha - def add(self, annotation_id: str, annotation: List[BaseAnnotation]) -> "Pecha": + def add(self, annotation_id: str, annotation: List[BaseAnnotation], annotation_type: AnnotationType) -> "Pecha": base_name = next(iter(self.bases)) - ann_type = get_annotation_type(annotation) + ann_type = annotation_type if check_annotation_exists(self.layer_path/base_name/f"{ann_type.value}-{annotation_id}.json"): raise ValueError(f"Annotation with id {annotation_id} already exists") ann_store, _ = self.add_layer(base_name=base_name, layer_type=ann_type, annotation_id=annotation_id) @@ -102,20 +107,6 @@ def layer_path(self): layer_path.mkdir(parents=True, exist_ok=True) return layer_path - @property - def metadata_path(self): - return self.pecha_path / "metadata.json" - - - def load_metadata(self): - if not self.metadata_path.exists(): - return None - - with open(self.metadata_path) as f: - metadata = json.load(f) - - return PechaMetaData(**metadata) - def load_bases(self): bases = {} for base_file in self.base_path.rglob("*.txt"): @@ -189,7 +180,6 @@ def add_annotation( # Add Annotation Group Type ann_group_type = layer_type.annotation_group_type ann_data[ann_group_type.value] = layer_type.value - start, end = ( annotation.span.start, annotation.span.end, @@ -219,32 +209,9 @@ def add_annotation( raise StamAddAnnotationError( f"[Error] Failed to add annotation to STAM: {e}" ) + return ann_store - def set_metadata(self, pecha_metadata: Dict): - # Retrieve parser name - parser_name = self.metadata.parser if self.metadata else None - if "parser" not in pecha_metadata: - pecha_metadata["parser"] = parser_name - - # Retrieve initial creation type name - initial_creation_type = ( - self.metadata.initial_creation_type if self.metadata else None - ) - if "initial_creation_type" not in pecha_metadata: - pecha_metadata["initial_creation_type"] = initial_creation_type - - try: - pecha_metadata = PechaMetaData(**pecha_metadata) - except Exception as e: - raise ValueError(f"Invalid metadata: {e}") - - self.metadata = pecha_metadata - with open(self.metadata_path, "w") as f: - json.dump(self.metadata.to_dict(), f, ensure_ascii=False, indent=2) - - return self.metadata - def get_segmentation_layer_path(self) -> str: """ 1. Get the first layer file from the pecha @@ -256,11 +223,6 @@ def get_segmentation_layer_path(self) -> str: return relative_layer_path - def get_first_layer_path(self) -> str: - layer_path = list(self.layer_path.rglob("*.json"))[0] - relative_layer_path = layer_path.relative_to(self.pecha_path.parent).as_posix() - - return relative_layer_path def get_layer_by_ann_type(self, base_name: str, layer_type: AnnotationType): """ @@ -296,7 +258,10 @@ def get_anns(ann_store: AnnotationStore, include_span: bool = False): for ann in ann_store: ann_data = {} for data in ann: - ann_data[data.key().id()] = data.value().get() + k = data.key().id() + if k in ["index"]: + continue + ann_data[k] = data.value().get() curr_ann = {**ann_data, "text": str(ann)} if include_span: curr_ann["span"] = { @@ -310,15 +275,6 @@ def get_anns(ann_store: AnnotationStore, include_span: bool = False): def load_layer(path: Path) -> AnnotationStore: return AnnotationStore(file=str(path)) - -def get_annotation_type(annotation: List[BaseAnnotation]): - if hasattr(annotation[0], "alignment_index") and hasattr(annotation[0], "index"): - return AnnotationType.ALIGNMENT - elif hasattr(annotation[0], "index") and not hasattr(annotation[0], "alignment_index"): - return AnnotationType.SEGMENTATION - else: - raise ValueError("Invalid annotation type") - def check_annotation_exists(annotation_path: Path): if annotation_path.exists(): return True diff --git a/src/openpecha/pecha/annotations.py b/src/openpecha/pecha/annotations.py index 44f48689..0d37c6b4 100644 --- a/src/openpecha/pecha/annotations.py +++ b/src/openpecha/pecha/annotations.py @@ -40,25 +40,26 @@ def end_must_not_be_less_than_start(self) -> "span": class BaseAnnotation(BaseModel): span: span - metadata: Optional[Dict] = None model_config = ConfigDict(extra="allow") def get_dict(self): res = self.model_dump() # Remove span from the dictionary - res.pop("span") + to_remove_keys = ["span"] + for key in to_remove_keys: + res.pop(key) # Remove None values from the dictionary res = {k: v for k, v in res.items() if v is not None} return res class SegmentationAnnotation(BaseAnnotation): - index: int + id: str = Field(..., description="Annotation ID") class AlignmentAnnotation(BaseAnnotation): - index: int + id: str = Field(..., description="Annotation ID") alignment_index: list[int] = Field( description="Index of the alignment, which can be of translation or commentary" ) diff --git a/src/openpecha/pecha/metadata.py b/src/openpecha/pecha/metadata.py deleted file mode 100644 index 9dc4a40f..00000000 --- a/src/openpecha/pecha/metadata.py +++ /dev/null @@ -1,263 +0,0 @@ -from datetime import datetime -from enum import Enum -from typing import Dict, List, Optional - -from pydantic import BaseModel, ConfigDict, field_serializer, model_validator - -from openpecha.ids import get_initial_pecha_id - - -class InitialCreationType(Enum): - ocr = "ocr" - ebook = "ebook" - input = "input" - tmx = "tmx" - json = "json" - google_docx = "google_docx" - - -class Language(Enum): - tibetan = "bo" - english = "en" - literal_chinese = "lzh" - chinese = "zh" - sanskrit = "sa" - italian = "it" - russian = "ru" - hindi = "hi" - - -class CopyrightStatus(Enum): - UNKNOWN = "Unknown" - COPYRIGHTED = "In copyright" - PUBLIC_DOMAIN = "Public domain" - - -class Copyright(BaseModel): - status: CopyrightStatus = CopyrightStatus.UNKNOWN # noqa - notice: Optional[str] = "" - info_url: Optional[str] = None - - model_config = ConfigDict(extra="forbid") - - -Copyright_copyrighted = Copyright( - status=CopyrightStatus.COPYRIGHTED, - notice="In copyright by the original author or editor", - info_url="http://rightsstatements.org/vocab/InC/1.0/", -) - -Copyright_unknown = Copyright( - status=CopyrightStatus.UNKNOWN, - notice="Copyright Undertermined", - info_url="http://rightsstatements.org/vocab/UND/1.0/", -) - -Copyright_public_domain = Copyright( - status=CopyrightStatus.PUBLIC_DOMAIN, - notice="Public domain", - info_url="https://creativecommons.org/publicdomain/mark/1.0/", -) - - -class LicenseType(Enum): - # based on https://creativecommons.org/licenses/ - - CC0 = "CC0" - PUBLIC_DOMAIN_MARK = "Public Domain Mark" - CC_BY = "CC BY" - CC_BY_SA = "CC BY-SA" - CC_BY_ND = "CC BY-ND" - CC_BY_NC = "CC BY-NC" - CC_BY_NC_SA = "CC BY-NC-SA" - CC_BY_NC_ND = "CC BY-NC-ND" - - UNDER_COPYRIGHT = "under copyright" - UNKNOWN = "Unknown" - - -class PechaMetaData(BaseModel): - id: str - title: Optional[Dict[str, str] | str] = None - author: Optional[List[str] | Dict[str, str] | str] = None - imported: Optional[datetime] = None - source: Optional[str] = None - toolkit_version: str - parser: str - initial_creation_type: InitialCreationType - language: Optional[Language] = None - source_metadata: Dict = {} - bases: Dict = {} - copyright: Copyright = Copyright() - licence: LicenseType = LicenseType.UNKNOWN - - model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True) - # Optional fields from both classes - legacy_id: Optional[str] = None - source_file: Optional[str] = None - - # Metadata fields - ocr_import_info: Optional[Dict] = None - statistics: Optional[Dict] = None - quality: Optional[Dict] = None - - # Time tracking - last_modified: Optional[datetime] = None - - # Validators from both classes - @model_validator(mode="before") - def set_id(cls, values): - if "id" not in values or values["id"] is None: - values["id"] = get_initial_pecha_id() - return values - - # @model_validator(mode="before") - # def validate_parser(cls, values): - # if "parser" in values and values["parser"]: - # parser_classes = cls.get_toolkit_parsers() - # if values["parser"] not in [name for name, _ in parser_classes]: - # raise ValueError( - # f"Parser {values['parser']} not in the Toolkit parsers." - # ) - # return values - - @model_validator(mode="before") - def set_toolkit_version(cls, values): - if "toolkit_version" not in values or values["toolkit_version"] is None: - try: - from importlib.metadata import PackageNotFoundError, version - - toolkit_version = version("openpecha") - values["toolkit_version"] = toolkit_version - except PackageNotFoundError as e: - raise RuntimeError("Package 'openpecha' not found.") from e - except Exception as e: - raise RuntimeError(f"Error fetching toolkit version: {str(e)}") from e - return values - - @model_validator(mode="before") - def set_imported(cls, values): - if "imported" not in values or values["imported"] is None: - values["imported"] = datetime.now() - return values - - @model_validator(mode="before") - def set_last_modified(cls, values): - if "last_modified" not in values or values["last_modified"] is None: - values["last_modified"] = datetime.now() - return values - - @model_validator(mode="before") - def set_copyright_info(cls, values): - if "copyright" not in values or values["copyright"] is None: - values["copyright"] = Copyright() - return values - - # Serializers for complex types - @field_serializer("imported", mode="plain") - def serialize_imported(self, value: Optional[datetime]) -> Optional[str]: - return value.isoformat() if value else None - - @field_serializer("last_modified", mode="plain") - def serialize_last_modified(self, value: Optional[datetime]) -> Optional[str]: - return value.isoformat() if value else None - - @field_serializer("licence", mode="plain") - def serialize_licence(self, value: Optional[LicenseType]) -> Optional[str]: - return value.value if value else None - - @field_serializer("language", mode="plain") - def serialize_language(self, value: Optional[Language]) -> Optional[str]: - return value.value if value else None - - @field_serializer("initial_creation_type", mode="plain") - def serialize_initial_creation_type( - self, value: Optional[InitialCreationType] - ) -> Optional[str]: - return value.value if value else None - - @field_serializer("copyright", mode="plain") - def serialize_copyright(self, value: Optional[Copyright]) -> Optional[Dict]: - if not value: - return None - return { - "status": value.status.value, - "notice": value.notice, - "info_url": value.info_url, - } - - def update_last_modified_date(self): - self.last_modified = datetime.now() - - # @classmethod - # def get_toolkit_parsers(cls): - # # List to store all classes from the package - # all_classes = [] - # import sys - - # base_path = Path(__file__).parent / "parsers" - # pecha_parser_path = "openpecha.pecha.parsers" - - # for py_file in base_path.rglob("*.py"): - # path_parts = list(py_file.parts) - # path_parts[-1] = path_parts[-1].replace(".py", "") - # if path_parts[-1] == "__init__": - # path_parts.pop() - - # if path_parts[0] == "/": - # path_parts.pop(0) - - # start_index = path_parts.index(pecha_parser_path.split(".")[0]) - # parser_path = ".".join(path_parts[start_index:]) - # importlib.import_module(parser_path) - # classes = inspect.getmembers(sys.modules[parser_path], inspect.isclass) - # all_classes.extend(classes) - - # parsers = importlib.import_module("openpecha.pecha.parsers") - # parser_classes = [ - # (name, class_) - # for name, class_ in all_classes - # if issubclass(class_, parsers.BaseParser) - # or issubclass(class_, parsers.OCRBaseParser) - # and class_ is not parsers.BaseParser - # and class_ is not parsers.OCRBaseParser - # ] - # return parser_classes - - def to_dict(self): - """ - Prepare PechaMetaData attribute to be JSON serializable - """ - data = self.model_dump() - - # Dynamically get standard fields from the model - standard_fields = list(set(type(self).model_fields.keys())) - - # Move any extra fields to source_metadata - extra_fields = {} - for k, v in data.items(): - if k not in standard_fields: - if isinstance(v, Enum): - extra_fields[k] = v.value - elif isinstance(v, datetime): - extra_fields[k] = v.isoformat() - else: - extra_fields[k] = v - - if "source_metadata" not in data: - data["source_metadata"] = {} - data["source_metadata"].update(extra_fields) - - # Remove extra fields from the top-level data - for field in extra_fields: - del data[field] - - return data - - -class InitialPechaMetadata(PechaMetaData): - @model_validator(mode="before") - def set_id(cls, values): - if "id" not in values or values["id"] is None: - values["id"] = get_initial_pecha_id() - return values diff --git a/src/openpecha/pecha/parsers/edition.py b/src/openpecha/pecha/parsers/edition.py index 27a028a1..7bb996e5 100644 --- a/src/openpecha/pecha/parsers/edition.py +++ b/src/openpecha/pecha/parsers/edition.py @@ -16,6 +16,7 @@ from openpecha.pecha.layer import AnnotationType from openpecha.pecha.parsers import update_coords from openpecha.pecha.serializers.json import JsonSerializer +from openpecha.ids import get_annotation_id logger = get_logger(__name__) @@ -44,8 +45,8 @@ def parse_segmentation(self, segments: list[str]) -> list[SegmentationAnnotation for index, segment in enumerate(segments, start=1): anns.append( SegmentationAnnotation( + id=str(index), span=span(start=char_count, end=char_count + len(segment)), - index=index, ) ) char_count += len(segment) + 1 diff --git a/src/openpecha/utils.py b/src/openpecha/utils.py index 750b8a49..b51baca0 100644 --- a/src/openpecha/utils.py +++ b/src/openpecha/utils.py @@ -27,7 +27,7 @@ def read_csv(file_path) -> List[List[str]]: with open(file_path, newline="", encoding="utf-8") as file: reader = csv.reader(file) rows = list(reader) - return rows +# return rows def write_csv(file_path, data) -> None: @@ -58,5 +58,5 @@ def write_json( def convert_to_base_annotation(raw_annotation): span_data = raw_annotation["span"] annotation_span = span(start=span_data["start"], end=span_data["end"]) - annotation_data = {k: v for k, v in raw_annotation.items() if k != "span"} + annotation_data = {k: v for k, v in raw_annotation.items() if k != "span" and k != "index"} return BaseAnnotation(span=annotation_span, **annotation_data) \ No newline at end of file diff --git a/tests/buda/data/OP_info-W12827.ttl b/tests/buda/data/OP_info-W12827.ttl deleted file mode 100644 index 7440d773..00000000 --- a/tests/buda/data/OP_info-W12827.ttl +++ /dev/null @@ -1,191 +0,0 @@ -@prefix : . -@prefix aut: . -@prefix bdan: . -@prefix bd: . -@prefix bf: . -@prefix owl: . -@prefix tbr: . -@prefix bdou: . -@prefix rsh: . -@prefix xsd: . -@prefix admin: . -@prefix skos: . -@prefix rdfs: . -@prefix bdac: . -@prefix wd: . -@prefix dr: . -@prefix oa: . -@prefix dila: . -@prefix sh: . -@prefix tmp: . -@prefix dcterms: . -@prefix text: . -@prefix bda: . -@prefix foaf: . -@prefix bdd: . -@prefix ad: . -@prefix bdg: . -@prefix f: . -@prefix vcard: . -@prefix adm: . -@prefix bdo: . -@prefix iiif2: . -@prefix iiif3: . -@prefix adr: . -@prefix viaf: . -@prefix bds: . -@prefix eftr: . -@prefix bdr: . -@prefix bdu: . -@prefix as: . -@prefix rdf: . -@prefix tm: . -@prefix ldp: . - -bdr:I2062 a bdo:ImageGroup ; - bdo:volumeNumber 2 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - skos:prefLabel "volume 1"@en ; - bdo:volumePagesTotal 493 . - -bdr:I2068 a bdo:ImageGroup ; - bdo:volumeNumber 8 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - skos:prefLabel "pod 1"@bo-x-ewts ; - bdo:volumePagesTotal 535 . - -bdr:I2071 a bdo:ImageGroup ; - bdo:volumeNumber 11 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 525 . - -bdr:LangBo a bdo:Language , owl:Class ; - rdfs:seeAlso , ; - rdfs:subClassOf bdo:Language ; - bdo:langBCP47Lang "bo" ; - bdo:langMARCCode "tib" ; - skos:prefLabel "藏文"@zh-hans , "bod yig"@bo-x-ewts , "Tibetan"@en . - -bdr:I2061 a bdo:ImageGroup ; - bdo:volumeNumber 1 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 459 . - -bdr:I2067 a bdo:ImageGroup ; - bdo:volumeNumber 7 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 645 . - -bdr:I2070 a bdo:ImageGroup ; - bdo:volumeNumber 10 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 547 . - -bdr:I2066 a bdo:ImageGroup ; - bdo:volumeNumber 6 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 729 . - -bdr:W12827 a bdo:DigitalInstance , bdo:Instance , bdo:ImageInstance ; - bdo:inCollection bdr:PRHD01 , bdr:PR01DOR0 , bdr:PR01JW33478 , bdr:PR1PL480 ; - bdo:instanceHasVolume bdr:I2072 , bdr:I2071 , bdr:I2066 , bdr:I2073 , bdr:I2062 , bdr:I2068 , bdr:I2067 , bdr:I2064 , bdr:I2061 , bdr:I2063 , bdr:I2070 , bdr:I2069 , bdr:I2065 ; - bdo:instanceOf bdr:WA12827 ; - bdo:instanceReproductionOf bdr:MW12827 ; - bdo:isRoot true ; - bdo:numberOfVolumes 13 ; - tmp:thumbnailIIIFService . - -bda:W12827 a adm:AdminData ; - adm:access bda:AccessOpen ; - adm:adminAbout bdr:W12827 ; - adm:contentLegal bda:LD_BDRC_PD ; - adm:facetIndex 66 ; - adm:gitPath "d3/W12827.trig" ; - adm:gitRepo bda:GR0014 ; - adm:gitRevision "e1b570dbf911313d270802fce1bb694a51d8a2e1" ; - adm:graphId bdg:W12827 ; - adm:metadataLegal bda:LD_BDRC_CC0 ; - adm:restrictedInChina true ; - adm:status bda:StatusReleased ; - bdo:isRoot true . - -bdr:I2065 a bdo:ImageGroup ; - bdo:volumeNumber 5 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 617 . - -bdr:WA12827 a bdo:Work ; - bdo:catalogInfo "The Nyingtik Yabzhi of Longchen Rabjam Drime Ozer (1308-1364). Collection of profound Dzogchen teachings. Consists of the Kandro Nyingtik, Kandro Yangtik, Bima Nyingtik, Lama Yangtik, and Zabmo Yangtik. Scanned with the generous support of Master Tam Shek-Wing of the Vajrayana Buddhist Association."@en ; - bdo:creator bdr:CR55948F3FDC9CACD3 ; - bdo:isRoot true ; - bdo:language bdr:LangBo ; - bdo:workHasInstance bdr:W1KG12048 , bdr:MW2PD19078 , bdr:MW1KG12048 , bdr:MW12827 , bdr:W3CN3025 , bdr:MW1KG9720 , bdr:W2PD19078 , bdr:MW4PD2049 , bdr:W4PD2043 , bdr:MW4PD2043 , bdr:W12827 , bdr:W4PD2049 , bdr:MW3CN3025 , bdr:W1KG9720 ; - bdo:workIsAbout bdr:T354 , bdr:WA3JT13386 ; - tmp:entityScore 28 ; - skos:prefLabel "snying thig ya bzhi/"@bo-x-ewts . - -bdr:I2064 a bdo:ImageGroup ; - bdo:volumeNumber 4 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 597 . - -bdr:I2073 a bdo:ImageGroup ; - bdo:volumeNumber 13 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 571 . - -bdr:P1583 skos:prefLabel "klong chen rab 'byams pa dri med 'od zer/"@bo-x-ewts , "隆钦热降巴·赤墨俄色"@zh-hans . - -bdr:I2063 a bdo:ImageGroup ; - bdo:volumeNumber 3 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 451 . - -bdr:I2069 a bdo:ImageGroup ; - bdo:volumeNumber 9 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 559 . - -bdr:CR55948F3FDC9CACD3 - a bdo:AgentAsCreator ; - bdo:agent bdr:P1583 ; - bdo:role bdr:R0ER0019 . - -bdr:MW12827 a bdo:Instance ; - bf:identifiedBy bdr:ID9E0972E23D10FF0F , bdr:ID4CF611CFA57E16D8 , bdr:IDEC1B3ED2EB801B0F ; - bdo:authorshipStatement "arranged and structured by klon-chen rab-'byams-pa"@en ; - bdo:biblioNote "reproduced from a set of prints from a-'dzom chos-sgar blocksv. 7-13. published by talung tsetrul pema wangyal, darjeeling, w.b."@en ; - bdo:extentStatement "13 v." ; - bdo:hasPart bdr:MW12827_BB8776 , bdr:MW12827_C93AD5 , bdr:MW12827_CBDB0C , bdr:MW12827_8E3796 , bdr:MW12827_58921B ; - bdo:hasSourcePrintery bdr:G3JT12503 ; - bdo:hasTitle bdr:TT7FFDCDE93527101E ; - bdo:instanceEvent bdr:EVF192DCB4E6693489 ; - bdo:instanceHasReproduction bdr:W12827 ; - bdo:instanceOf bdr:WA12827 ; - bdo:isRoot true ; - bdo:note bdr:NT7E4CD6992DD16DE1 ; - bdo:numberOfVolumes 13 ; - bdo:printMethod bdr:PrintMethod_Relief_WoodBlock ; - bdo:publisherLocation "delhi"@en ; - bdo:publisherName "sherab gyaltsen lama"@en ; - bdo:script bdr:ScriptTibt ; - tmp:thumbnailIIIFService ; - skos:prefLabel "snying thig ya bzhi/"@bo-x-ewts . - -bdr:I2072 a bdo:ImageGroup ; - bdo:volumeNumber 12 ; - bdo:volumeOf bdr:W12827 ; - bdo:volumePagesTbrcIntro 0 ; - bdo:volumePagesTotal 489 . diff --git a/tests/buda/data/expected-W12827.json b/tests/buda/data/expected-W12827.json deleted file mode 100644 index 83d1ec25..00000000 --- a/tests/buda/data/expected-W12827.json +++ /dev/null @@ -1,95 +0,0 @@ -{ - "source_metadata":{ - "id":"http://purl.bdrc.io/resource/W12827", - "status":"http://purl.bdrc.io/admindata/StatusReleased", - "access":"http://purl.bdrc.io/admindata/AccessOpen", - "reproduction_of":"http://purl.bdrc.io/resource/MW12827", - "copyright_status":"http://purl.bdrc.io/resource/CopyrightPublicDomain", - "languages": ["bo"], - "title":"སྙིང་ཐིག་ཡ་བཞི།", - "author":"ཀློང་ཆེན་རབ་འབྱམས་པ་དྲི་མེད་འོད་ཟེར།", - "geo_restriction": ["CN"] - }, - "image_groups":{ - "I2072":{ - "id":"http://purl.bdrc.io/resource/I2072", - "total_pages":489, - "volume_number":12, - "volume_pages_bdrc_intro":0 - }, - "I2071":{ - "id":"http://purl.bdrc.io/resource/I2071", - "total_pages":525, - "volume_number":11, - "volume_pages_bdrc_intro":0 - }, - "I2066":{ - "id":"http://purl.bdrc.io/resource/I2066", - "total_pages":729, - "volume_number":6, - "volume_pages_bdrc_intro":0 - }, - "I2073":{ - "id":"http://purl.bdrc.io/resource/I2073", - "total_pages":571, - "volume_number":13, - "volume_pages_bdrc_intro":0 - }, - "I2062":{ - "id":"http://purl.bdrc.io/resource/I2062", - "total_pages":493, - "volume_number":2, - "volume_pages_bdrc_intro":0, - "title":"volume 1" - }, - "I2068":{ - "id":"http://purl.bdrc.io/resource/I2068", - "total_pages":535, - "volume_number":8, - "volume_pages_bdrc_intro":0, - "title":"པོད ༡" - }, - "I2067":{ - "id":"http://purl.bdrc.io/resource/I2067", - "total_pages":645, - "volume_number":7, - "volume_pages_bdrc_intro":0 - }, - "I2064":{ - "id":"http://purl.bdrc.io/resource/I2064", - "total_pages":597, - "volume_number":4, - "volume_pages_bdrc_intro":0 - }, - "I2061":{ - "id":"http://purl.bdrc.io/resource/I2061", - "total_pages":459, - "volume_number":1, - "volume_pages_bdrc_intro":0 - }, - "I2063":{ - "id":"http://purl.bdrc.io/resource/I2063", - "total_pages":451, - "volume_number":3, - "volume_pages_bdrc_intro":0 - }, - "I2070":{ - "id":"http://purl.bdrc.io/resource/I2070", - "total_pages":547, - "volume_number":10, - "volume_pages_bdrc_intro":0 - }, - "I2069":{ - "id":"http://purl.bdrc.io/resource/I2069", - "total_pages":559, - "volume_number":9, - "volume_pages_bdrc_intro":0 - }, - "I2065":{ - "id":"http://purl.bdrc.io/resource/I2065", - "total_pages":617, - "volume_number":5, - "volume_pages_bdrc_intro":0 - } - } -} \ No newline at end of file diff --git a/tests/buda/test_buda_api.py b/tests/buda/test_buda_api.py deleted file mode 100644 index f147bafd..00000000 --- a/tests/buda/test_buda_api.py +++ /dev/null @@ -1,16 +0,0 @@ -import json -from pathlib import Path - -import rdflib - -from openpecha.buda.api import _res_from_model - - -def test_buda_info_from_model(): - ttl_path = Path(__file__).parent / "data" / "OP_info-W12827.ttl" - g = rdflib.Graph().parse(str(ttl_path), format="ttl") - res = _res_from_model(g, "W12827") - expected_path = Path(__file__).parent / "data" / "expected-W12827.json" - with open(expected_path) as expected_file: - expected = json.load(expected_file) - assert res == expected diff --git a/tests/pecha/parser/edition/test_edition.py b/tests/pecha/parser/edition/test_edition.py index 847152f3..b3386939 100644 --- a/tests/pecha/parser/edition/test_edition.py +++ b/tests/pecha/parser/edition/test_edition.py @@ -14,6 +14,7 @@ from openpecha.pecha.parsers.edition import EditionParser from openpecha.pecha.serializers.json import JsonSerializer from openpecha.utils import read_json +from openpecha.pecha.annotations import VersionVariantOperations class TestEditionParser(TestCase): @@ -36,16 +37,16 @@ def test_segmentation_parse(self): anns = parser.parse_segmentation(segments) expected_anns = [ - SegmentationAnnotation(span=span(start=0, end=87), index=1), - SegmentationAnnotation(span=span(start=88, end=207), index=2), - SegmentationAnnotation(span=span(start=208, end=283), index=3), - SegmentationAnnotation(span=span(start=284, end=361), index=4), - SegmentationAnnotation(span=span(start=362, end=508), index=5), - SegmentationAnnotation(span=span(start=509, end=844), index=6), - SegmentationAnnotation(span=span(start=845, end=1129), index=7), - SegmentationAnnotation(span=span(start=1130, end=1217), index=8), - SegmentationAnnotation(span=span(start=1218, end=1409), index=9), - SegmentationAnnotation(span=span(start=1410, end=1605), index=10), + SegmentationAnnotation(span=span(start=0, end=87), id="1"), + SegmentationAnnotation(span=span(start=88, end=207), id="2"), + SegmentationAnnotation(span=span(start=208, end=283), id="3"), + SegmentationAnnotation(span=span(start=284, end=361), id="4"), + SegmentationAnnotation(span=span(start=362, end=508), id="5"), + SegmentationAnnotation(span=span(start=509, end=844), id="6"), + SegmentationAnnotation(span=span(start=845, end=1129), id="7"), + SegmentationAnnotation(span=span(start=1130, end=1217), id="8"), + SegmentationAnnotation(span=span(start=1218, end=1409), id="9"), + SegmentationAnnotation(span=span(start=1410, end=1605), id="10"), ] assert anns == expected_anns @@ -59,34 +60,34 @@ def test_segmentation_parse(self): updated_anns = update_coords(anns, old_base, new_base) expected_updated_anns = [ SegmentationAnnotation( - span=span(start=0, end=87, errors=None), metadata=None, index=1 + span=span(start=0, end=87, errors=None), id="1" ), SegmentationAnnotation( - span=span(start=88, end=208, errors=None), metadata=None, index=2 + span=span(start=88, end=208, errors=None), id="2" ), SegmentationAnnotation( - span=span(start=209, end=284, errors=None), metadata=None, index=3 + span=span(start=209, end=284, errors=None), id="3" ), SegmentationAnnotation( - span=span(start=285, end=363, errors=None), metadata=None, index=4 + span=span(start=285, end=363, errors=None), id="4" ), SegmentationAnnotation( - span=span(start=364, end=511, errors=None), metadata=None, index=5 + span=span(start=364, end=511, errors=None), id="5" ), SegmentationAnnotation( - span=span(start=512, end=843, errors=None), metadata=None, index=6 + span=span(start=512, end=843, errors=None), id="6" ), SegmentationAnnotation( - span=span(start=843, end=1089, errors=None), metadata=None, index=7 + span=span(start=843, end=1089, errors=None), id="7" ), SegmentationAnnotation( - span=span(start=1090, end=1221, errors=None), metadata=None, index=8 + span=span(start=1090, end=1221, errors=None), id="8" ), SegmentationAnnotation( - span=span(start=1222, end=1411, errors=None), metadata=None, index=9 + span=span(start=1222, end=1411, errors=None), id="9" ), SegmentationAnnotation( - span=span(start=1412, end=1626, errors=None), metadata=None, index=10 + span=span(start=1412, end=1626, errors=None), id="10" ), ] @@ -100,37 +101,37 @@ def test_version_parse(self): new_base = "Hello World" diffs = parser.parse_version(old_base, new_base) assert diffs == [ - Version(span=span(start=5, end=5), operation="insertion", text=" World") + Version(span=span(start=5, end=5), operation=VersionVariantOperations.INSERTION, text=" World") ] # Deletion old_base = "Hello World" new_base = "Hello" diffs = parser.parse_version(old_base, new_base) - assert diffs == [Version(span=span(start=5, end=11), operation="deletion")] + assert diffs == [Version(span=span(start=5, end=11), operation=VersionVariantOperations.DELETION)] # Insertion in Between old_base = "Hello World" new_base = "Hello!! World" diffs = parser.parse_version(old_base, new_base) assert diffs == [ - Version(span=span(start=5, end=5), operation="insertion", text="!!") + Version(span=span(start=5, end=5), operation=VersionVariantOperations.INSERTION, text="!!") ] # Deletion in Between old_base = "Good morning, Everyone" new_base = "Good Everyone" diffs = parser.parse_version(old_base, new_base) - assert diffs == [Version(span=span(start=4, end=13), operation="deletion")] + assert diffs == [Version(span=span(start=4, end=13), operation=VersionVariantOperations.DELETION)] # Insertion and Deletion old_base = "Good morning, Ladies and Gentlemen" new_base = "Good Attractive Ladies and Gentlemen" diffs = parser.parse_version(old_base, new_base) assert diffs == [ - Version(span=span(start=5, end=13), operation="deletion"), + Version(span=span(start=5, end=13), operation=VersionVariantOperations.DELETION), Version( - span=span(start=13, end=13), operation="insertion", text="Attractive" + span=span(start=13, end=13), operation=VersionVariantOperations.INSERTION, text="Attractive" ), ] @@ -141,95 +142,81 @@ def test_version_parse(self): segments = self.txt_file.read_text(encoding="utf-8").splitlines() new_base = "\n".join(segments) diffs = parser.parse_version(old_base, new_base) + assert diffs == [ Version( span=span(start=87, end=87, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="\n", ), Version( span=span(start=282, end=282, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="\n", ), Version( span=span(start=673, end=674, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=888, end=888, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text=" རྟག་ཏུ་ཚུལ་ཁྲིམས་ཡང་དག་བླངས་ནས་གནས་པར་འགྱུར།", ), Version( span=span(start=1034, end=1080, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1080, end=1080, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="འགྲོ་བ་དགྲོལ་བར་བྱ་ཕྱིར་ཡོངས་སུ་བསྔོ་བྱེད་ཅིང༌", ), Version( span=span(start=1083, end=1083, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="\n", ), Version( span=span(start=1170, end=1213, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1279, end=1279, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="པར་", ), Version( span=span(start=1322, end=1323, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1323, end=1323, errors=None), - metadata=None, - operation="insertion", + operation=VersionVariantOperations.INSERTION, text="བ", ), Version( span=span(start=1441, end=1444, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1497, end=1500, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1573, end=1585, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), Version( span=span(start=1616, end=1617, errors=None), - metadata=None, - operation="deletion", + operation=VersionVariantOperations.DELETION, text="", ), ] @@ -244,70 +231,71 @@ def test_parse(self): ann_store=AnnotationStore(file=str(self.pecha.layer_path / seg_layer_path)), include_span=True, ) + expected_seg_anns = [ { - "index": 1, + "id": "1", "segmentation_type": "segmentation", "text": "བུ་མ་འཇུག་པ་ལས་སེམས་བསྐྱེད་དྲུག་པ། ཤོ་ལོ་ཀ ༡-༦༤ མངོན་དུ་ཕྱོགས་པར་མཉམ་བཞག་སེམས་གནས་ཏེ། །", "span": {"start": 0, "end": 87}, }, { - "index": 2, + "id": "2", "segmentation_type": "segmentation", "text": "ྫོགས་པའི་སངས་རྒྱས་ཆོས་ལ་མངོན་ཕྱོགས་ཤིང༌། །འདི་བརྟེན་འབྱུང་བའི་དེ་ཉིད་མཐོང་བ་དེས། །ཤེས་རབ་གནས་པས་འགོག་པ་ཐོབ་པར་འགྱུར། །", "span": {"start": 88, "end": 206}, }, { - "index": 3, + "id": "3", "segmentation_type": "segmentation", "text": "ཇི་ལྟར་ལོང་བའི་ཚོགས་ཀུན་བདེ་བླག་ཏུ། །མིག་ལྡན་སྐྱེས་བུ་གཅིག་གིས་འདོད་པ་ཡི། །", "span": {"start": 207, "end": 282}, }, { - "index": 4, + "id": "4", "segmentation_type": "segmentation", "text": "ུལ་དུ་འཁྲིད་པ་དེ་བཞིན་འདིར་ཡང་བློས། །མིག་ཉམས་ཡོན་ཏན་བླངས་ཏེ་རྒྱལ་ཉིད་འགྲོ། །", "span": {"start": 283, "end": 359}, }, { - "index": 5, + "id": "5", "segmentation_type": "segmentation", "text": "ཇི་ལྟར་དེ་ཡིས་ཆེས་ཟབ་ཆོས་རྟོགས་པ། །ལུང་དང་གཞན་ཡང་རིགས་པས་ཡིན་པས་ན། །དེ་ལྟར་འཕགས་པ་ཀླུ་སྒྲུབ་གཞུང་ལུགས་ལས། །ཇི་ལྟར་གནས་པའི་ལུགས་བཞིན་བརྗོད་པར་བྱ། ", "span": {"start": 360, "end": 505}, }, { - "index": 6, + "id": "6", "segmentation_type": "segmentation", "text": "\nསོ་སོ་སྐྱེ་བོའི་དུས་ནའང་སྟོང་པ་ཉིད་ཐོས་ནས། །ནང་དུ་རབ་ཏུ་དགའ་བ་ཡང་དང་ཡང་དུ་འབྱུང༌། །རབ་ཏུ་དགའ་བ་ལས་བྱུང་མཆི་མས་མིག་བརླན་ཞིང༌། །ལུས་ཀྱི་བ་སྤུ་ལྡང་པར་འགྱུར་པ་གང་ཡིན་པ། །\nདེ་ལ་རྫོགས་པའི་སངས་རྒྱས་བློ་ཡི་ས་བོན་ཡོད། །དེ་ཉིད་ཉེ་བར་བསྟན་པའི་སྣོད་ནི་དེ་ཡིན་ཏེ། །དེ་ལ་དམ་པའི་དོན་གྱི་བདེན་པ་བསྟན་པར་བྱ། །དེ་ལ་དེ་ཡི་རྗེས་སུ་འགྲོ་བའི་ཡོན་ཏན་འབྱུང༌། །\nརྟག་ཏུ་ཚུལ་ཁྲིམས་ཡང་དག་བླངས་ནས་གནས་པར་འག", "span": {"start": 506, "end": 884}, }, { - "index": 7, + "id": "7", "segmentation_type": "segmentation", "text": "ུར། །སྦྱིན་པ་གཏོང་བར་འགྱུར་ཞིང་སྙིང་རྗེ་བསྟེན་པར་བྱེད། །བཟོད་པ་སྒོམ་བྱེད་དེ་ཡི་དགེ་བའང་བྱང་ཆུབ་ཏུ། །འགྲོ་བ་དགྲོལ་བར་བྱ་ཕྱིར་ཡོངས་སུ་བསྔོ་བྱེད་ཅིང༌། །\nརྫོགས་པའི་བྱང་ཆུབ་སེམས་དཔའ་རྣམས་ལ་གུས་པར་བྱེད། །ཟབ་ཅིང་རྒྱ་ཆེའི་ཚུལ་ལ་མཁས་པའི་སྐྱེ་བོས་ནི། །རིམ་གྱིས་རབ་ཏུ་དགའ་བའི་ས་ནི་འཐོབ་འགྱུར་བས།", "span": {"start": 885, "end": 1169}, }, { - "index": 8, + "id": "8", "segmentation_type": "segmentation", "text": "།དེ་ནི་དོན་དུ་གཉེར་བས་ལམ་འདི་མཉན་པར་གྱིས། །", "span": {"start": 1170, "end": 1213}, }, { - "index": 9, + "id": "9", "segmentation_type": "segmentation", "text": "དེ་ཉིད་དེ་ལས་འབྱུང་མིན་གཞན་དག་ལས་ལྟ་ག་ལ་ཞིག །གཉིས་ཀ་ལས་ཀྱང་མ་ཡིན་རྒྱུ་མེད་པར་ནི་ག་ལ་ཡོད། །དེ་ནི་དེ་ལས་འབྱུང་ན་ཡོན་ཏན་འགའ་ཡང་ཡོད་མ་ཡིན། །སྐྱེས་པར་གྱུར་པ་སླར་ཡང་སྐྱེ་བར་རིགས་པའང་མ་ཡིན་ཉིད། །\nསྐྱེ", "span": {"start": 1214, "end": 1407}, }, { - "index": 10, + "id": "10", "segmentation_type": "segmentation", "text": "་ཟིན་སླར་ཡང་སྐྱེ་བར་ཡོངས་སུ་རྟོག་པར་འགྱུར་ན་ནི། །མྱུ་གུ་ལ་སོགས་རྣམས་ཀྱི་སྐྱེ་བ་འདིར་རྙེད་མི་འགྱུར་ཞིང༌། །ས་བོན་སྲིད་མཐར་ཐུག་པར་རབ་ཏུ་སྐྱེ་བ་ཉིད་དུ་འགྱུར། །ཇི་ལྟར་དེ་ཉིད་ཀྱིས་དེ་", "span": {"start": 1408, "end": 1585}, }, ] assert seg_anns == expected_seg_anns - + version_anns = get_anns( ann_store=AnnotationStore(file=str(self.pecha.layer_path / version_path)), include_span=True, diff --git a/tests/pecha/test_annotation.py b/tests/pecha/test_annotation.py index 87868116..bf23821b 100644 --- a/tests/pecha/test_annotation.py +++ b/tests/pecha/test_annotation.py @@ -1,16 +1,10 @@ -import json - import pytest from pydantic import ValidationError from openpecha.pecha.annotations import ( - AnnotationModel, BaseAnnotation, - PechaAlignment, - PechaId, span, ) -from openpecha.pecha.layer import AnnotationType def test_span_end_must_not_be_less_than_start(): @@ -21,362 +15,4 @@ def test_span_end_must_not_be_less_than_start(): def test_annotation_id(): ann = BaseAnnotation(span=span(start=10, end=20)) assert ann.span.start == 10 - assert ann.span.end == 20 - assert ann.metadata is None - - -def test_pechaid_valid(): - pid = PechaId.validate("I1234ABCD") - assert pid == "I1234ABCD" - - -def test_pechaid_invalid(): - with pytest.raises(ValueError): - PechaId.validate("X1234ABCD") - with pytest.raises(ValueError): - PechaId.validate("I1234ABC") # too short - with pytest.raises(ValueError): - PechaId.validate("I1234ABCDE") # too long - with pytest.raises(ValueError): - PechaId.validate("I1234abcD") # lowercase - - -def test_pecha_alignment_fields(): - pa = PechaAlignment(pecha_id="I1234ABCD", alignment_id="align1") - assert pa.pecha_id == "I1234ABCD" - assert pa.alignment_id == "align1" - - -def test_annotation_model_minimal_alignment(): - align = PechaAlignment(pecha_id="I1234ABCD", alignment_id="align1") - am = AnnotationModel( - pecha_id="I1234ABCD", - type=AnnotationType.ALIGNMENT, - document_id="doc1", - path="ann1", - title="Test", - aligned_to=align, - ) - assert am.pecha_id == "I1234ABCD" - assert am.type == AnnotationType.ALIGNMENT - assert am.aligned_to == align - - -def test_annotation_model_minimal_non_alignment(): - am = AnnotationModel( - pecha_id="I1234ABCD", - type=AnnotationType.SEGMENTATION, - document_id="doc1", - path="ann1", - title="Test", - ) - assert am.pecha_id == "I1234ABCD" - assert am.type == AnnotationType.SEGMENTATION - assert am.aligned_to is None - - -def test_annotation_model_with_alignment(): - align = PechaAlignment(pecha_id="I1234ABCD", alignment_id="align1") - am = AnnotationModel( - pecha_id="I1234ABCD", - type=AnnotationType.ALIGNMENT, - document_id="doc1", - path="ann1", - title="Test", - aligned_to=align, - ) - assert am.aligned_to is not None - assert am.aligned_to.pecha_id == "I1234ABCD" - assert am.aligned_to.alignment_id == "align1" - - -def test_annotation_model_invalid_pechaid(): - with pytest.raises(ValidationError): - AnnotationModel( - pecha_id="BADID", - type=AnnotationType.ALIGNMENT, - document_id="doc1", - path="ann1", - title="Test", - ) - - -def test_annotation_model_missing_required(): - with pytest.raises(ValidationError): - AnnotationModel( - pecha_id="I1234ABCD", - type=AnnotationType.ALIGNMENT, - document_id="doc1", - # path missing - title="Test", - ) - - -class TestValidAnnotationModel: - """Tests for valid annotation models in different scenarios.""" - - def test_valid_annotation_minimal(self): - """Test minimal valid annotation with default values.""" - input_data = { - "pecha_id": "I12345678", - "document_id": "DOC123", - "title": "Test Annotation", - "path": "E11/layer.json", - } - - model = AnnotationModel(**input_data) - assert str(model.pecha_id) == "I12345678" - assert model.document_id == "DOC123" - assert model.title == "Test Annotation" - assert model.type == AnnotationType.SEGMENTATION - assert model.aligned_to is None - - def test_valid_annotation_with_type(self): - """Test valid annotation with explicit type.""" - input_data = { - "pecha_id": "I12345678", - "document_id": "DOC123", - "title": "Test Alignment Annotation", - "type": "alignment", - "path": "E11/layer.json", - } - - model = AnnotationModel(**input_data) - assert str(model.pecha_id) == "I12345678" - assert model.document_id == "DOC123" - assert model.title == "Test Alignment Annotation" - assert model.type == AnnotationType.ALIGNMENT - assert model.aligned_to is None - - def test_valid_annotation_with_alignment(self): - """Test valid annotation with alignment information.""" - input_data = { - "pecha_id": "I12345678", - "document_id": "DOC123", - "title": "Test Annotation with Alignment", - "type": "alignment", - "path": "E11/layer.json", - "aligned_to": { - "pecha_id": "I87654321", - "alignment_id": "ALIGN001", - }, - } - - model = AnnotationModel(**input_data) - assert str(model.pecha_id) == "I12345678" - assert model.document_id == "DOC123" - assert model.title == "Test Annotation with Alignment" - assert model.type == AnnotationType.ALIGNMENT - assert model.aligned_to is not None - assert model.model_dump()["aligned_to"]["pecha_id"] == "I87654321" - assert model.model_dump()["aligned_to"]["alignment_id"] == "ALIGN001" - - def test_valid_annotation_from_dict(self): - """Test creating a valid annotation from a dictionary.""" - input_data = { - "pecha_id": "I12345678", - "document_id": "DOC123", - "title": "Test Dict Annotation", - "path": "E11/layer.json", - } - - model = AnnotationModel.model_validate(input_data) - assert str(model.pecha_id) == "I12345678" - assert model.document_id == "DOC123" - assert model.title == "Test Dict Annotation" - assert model.type == AnnotationType.SEGMENTATION - - -class TestInvalidAnnotationModel: - """Tests for invalid annotation models that should raise validation errors.""" - - def test_invalid_pecha_id_format(self): - """Test that invalid pecha_id format raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="invalid_id", # Should start with I and contain 8 hex chars - document_id="DOC123", - title="Invalid ID Test", - path="E11/layer.json", - ) - - # Check the specific validation error message - errors = exc_info.value.errors() - assert any( - err["loc"] == ("pecha_id",) - and "PechaId must start with 'I' followed by 8 uppercase hex characters" - in err["msg"] - for err in errors - ) - - def test_missing_document_id(self): - """Test that missing document_id raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - # Missing document_id - title="Missing Document ID Test", - ) - - errors = exc_info.value.errors() - assert any("document_id" in str(err) for err in errors) - assert any("Field required" in str(err) for err in errors) - - def test_missing_title(self): - """Test that missing title raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - # Missing title - ) - - errors = exc_info.value.errors() - assert any("title" in str(err) for err in errors) - assert any("Field required" in str(err) for err in errors) - - def test_empty_title(self): - """Test that empty title raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="", # Empty title - ) - - errors = exc_info.value.errors() - assert any("title" in str(err) for err in errors) - assert any("min_length" in str(err) for err in errors) - - def test_invalid_document_id(self): - """Test that invalid document_id raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="", # Empty document_id - title="Invalid Document ID Test", - ) - - errors = exc_info.value.errors() - assert any("document_id" in str(err) for err in errors) - assert any("pattern" in str(err) for err in errors) - - def test_invalid_type(self): - """Test that invalid type raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="Invalid Type Test", - type="invalid_type", # Not in AnnotationType enum - ) - - errors = exc_info.value.errors() - assert any("type" in str(err) for err in errors) - assert any("enum" in str(err) for err in errors) - - def test_invalid_aligned_to(self): - """Test that invalid aligned_to raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="Invalid Alignment Test", - type="Alignment", - path="E11/layer.json", - aligned_to={ - "pecha_id": "invalid_id", # Invalid pecha_id format - "alignment_id": "ALIGN001", - }, - ) - - errors = exc_info.value.errors() - assert any("aligned_to" in str(err) for err in errors) - assert any( - err["loc"] == ("aligned_to", "pecha_id") - and "PechaId must start with 'I' followed by 8 uppercase hex characters" - in err["msg"] - for err in errors - ) - - def test_missing_alignment_id(self): - """Test that missing alignment_id in aligned_to raises ValidationError.""" - with pytest.raises(ValidationError) as exc_info: - AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="Missing Alignment ID Test", - type="alignment", - aligned_to={ - "pecha_id": "I87654321", - # Missing alignment_id - }, - ) - - errors = exc_info.value.errors() - assert any("aligned_to" in str(err) for err in errors) - - -class TestAnnotationModelSerialization: - """Tests for serialization of annotation models.""" - - def test_model_dump(self): - """Test model_dump() produces the expected dictionary.""" - model = AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="Serialization Test", - type="alignment", - path="E11/layer.json", - aligned_to={ - "pecha_id": "I87654321", - "alignment_id": "ALIGN001", - }, - ) - - data = model.model_dump() - assert data["pecha_id"] == "I12345678" # Should be string not nested object - assert data["document_id"] == "DOC123" - assert data["title"] == "Serialization Test" - assert data["type"] == AnnotationType.ALIGNMENT - assert data["path"] == "E11/layer.json" - assert data["aligned_to"]["pecha_id"] == "I87654321" - assert data["aligned_to"]["alignment_id"] == "ALIGN001" - - def test_model_dump_json(self): - """Test model_dump_json() produces valid JSON with expected structure.""" - model = AnnotationModel( - pecha_id="I12345678", - document_id="DOC123", - title="JSON Serialization Test", - path="E11/layer.json", - ) - - json_str = model.model_dump_json() - data = json.loads(json_str) - - assert data["pecha_id"] == "I12345678" - assert data["document_id"] == "DOC123" - assert data["title"] == "JSON Serialization Test" - assert data["type"] == "segmentation" - assert data["path"] == "E11/layer.json" - assert data["aligned_to"] is None - - def test_json_schema(self): - """Test the JSON schema is correctly generated.""" - schema = AnnotationModel.model_json_schema() - - # Check basic schema structure - assert "properties" in schema - assert "pecha_id" in schema["properties"] - assert "document_id" in schema["properties"] - assert "title" in schema["properties"] - assert "type" in schema["properties"] - assert "aligned_to" in schema["properties"] - - # Check required fields - assert "required" in schema - required_fields = schema["required"] - assert "pecha_id" in required_fields - assert "document_id" in required_fields - assert "title" in required_fields + assert ann.span.end == 20 \ No newline at end of file diff --git a/tests/pecha/test_create_pecha.py b/tests/pecha/test_create_pecha.py index 6f12eff6..a711c3e3 100644 --- a/tests/pecha/test_create_pecha.py +++ b/tests/pecha/test_create_pecha.py @@ -1,4 +1,4 @@ -from openpecha.pecha import Pecha, get_anns, get_annotation_type +from openpecha.pecha import Pecha, get_anns from openpecha.utils import read_json, convert_to_base_annotation from pathlib import Path from openpecha.pecha.layer import AnnotationType @@ -9,25 +9,37 @@ def test_create_pecha(): data = read_json("tests/pecha/data/ITEST001.json") annotation = [convert_to_base_annotation(ann) for ann in data["annotation"]] annotation_id = generate_id() - pecha = Pecha.create_pecha(pecha_id=data["pecha_id"], base_text=data["base_text"], annotation_id=annotation_id, annotation=annotation) - + pecha = Pecha.create_pecha(pecha_id=data["pecha_id"], base_text=data["base_text"], annotation_id=annotation_id, annotation=annotation, annotation_type=AnnotationType.ALIGNMENT) + # assert pecha.id == data["pecha_id"] base_name = list(pecha.bases.keys())[0] assert pecha.bases[base_name] == data["base_text"] ann_store, _ = pecha.get_layer_by_ann_type(base_name=base_name, layer_type=AnnotationType.ALIGNMENT) + # ann_store is a list, we need to use the first AnnotationStore created_annotations = get_anns(ann_store[0] if isinstance(ann_store, list) else ann_store, include_span=True) assert len(created_annotations) == len(data["annotation"]) first_created = created_annotations[0] + first_original = data["annotation"][0] assert first_created["span"]["start"] == first_original["span"]["start"] assert first_created["span"]["end"] == first_original["span"]["end"] - assert first_created["index"] == first_original["index"] + assert not first_created.get("index") assert first_created["alignment_index"] == first_original["alignment_index"] + + pecha_annotation = pecha.annotations[0] + assert pecha_annotation["span"]["start"] == first_original["span"]["start"] + assert pecha_annotation["span"]["end"] == first_original["span"]["end"] + assert pecha_annotation.get("id", None) != None + # Check that only the expected keys are present in first_created + expected_keys = {"span", "id"} + actual_keys = set(pecha_annotation.keys()) + assert actual_keys <= expected_keys, f"Unexpected keys found: {actual_keys - expected_keys}" + assert expected_keys <= actual_keys, f"Missing expected keys: {expected_keys - actual_keys}" def test_add(): data = read_json("tests/pecha/data/ITEST001_alignment.json") @@ -36,7 +48,7 @@ def test_add(): base_name = next(iter(pecha.bases)) annotation_id = generate_id() - annotation_id = pecha.add(annotation_id=annotation_id, annotation=annotation) + annotation_id = pecha.add(annotation_id=annotation_id, annotation=annotation, annotation_type=AnnotationType.ALIGNMENT) ann_store, _ = pecha.get_layer_by_ann_type(base_name=base_name, layer_type=AnnotationType.ALIGNMENT) @@ -45,14 +57,15 @@ def test_add(): assert len(created_annotations) == len(data["annotation"]) first_created = created_annotations[0] + first_original = data["annotation"][0] assert first_created["span"]["start"] == first_original["span"]["start"] assert first_created["span"]["end"] == first_original["span"]["end"] - assert first_created["index"] == first_original["index"] + assert not first_created.get("index") assert first_created["alignment_index"] == first_original["alignment_index"] # Clean up - remove the added annotation layer to keep test data clean - ann_type = get_annotation_type(annotation) + ann_type = AnnotationType.ALIGNMENT annotation_layer_file = pecha.layer_path / base_name / f"{ann_type.value}-{annotation_id}.json" if annotation_layer_file.exists(): annotation_layer_file.unlink() \ No newline at end of file diff --git a/tests/pecha/test_update.py b/tests/pecha/test_update.py deleted file mode 100644 index 839f6729..00000000 --- a/tests/pecha/test_update.py +++ /dev/null @@ -1,26 +0,0 @@ -from openpecha.pecha import Pecha -from pathlib import Path -from openpecha.utils import read_json, convert_to_base_annotation -import subprocess -from openpecha.pecha.layer import AnnotationType -from openpecha.pecha import get_anns - - - -pecha = Pecha.from_path(Path(f"tests/pecha/update/data/ID8Sv2ynVKZX8wIt")) -annotation_id = "Tm3Uewnh3ySsvgIE" -annotation = [convert_to_base_annotation(ann) for ann in read_json("tests/pecha/update/data/updated_segmentation.json")] -layer_type = AnnotationType.SEGMENTATION - - -def test_update_annotation(): - updated_pecha = pecha.update_annotation(annotation_id=annotation_id, annotation=annotation, layer_type=layer_type) - assert updated_pecha.id == pecha.id - base_name = list(pecha.bases.keys())[0] - ann_store, _ = pecha.get_layer_by_ann_type(base_name=base_name, layer_type=layer_type) - - created_annotations = get_anns(ann_store[0] if isinstance(ann_store, list) else ann_store, include_span=True) - - assert len(created_annotations) == len(annotation) - subprocess.run("rm -rf tests/pecha/update/data/ID8Sv2ynVKZX8wIt", shell=True) - subprocess.run("cp -r tests/pecha/serializers/json/data/ID8Sv2ynVKZX8wIt tests/pecha/update/data/ID8Sv2ynVKZX8wIt", shell=True) \ No newline at end of file diff --git a/tests/pecha/update/data/ID8Sv2ynVKZX8wIt/layers/26E4/segmentation-Tm3Uewnh3ySsvgIE.json b/tests/pecha/update/data/ID8Sv2ynVKZX8wIt/layers/26E4/segmentation-Tm3Uewnh3ySsvgIE.json deleted file mode 100644 index bce267dc..00000000 --- a/tests/pecha/update/data/ID8Sv2ynVKZX8wIt/layers/26E4/segmentation-Tm3Uewnh3ySsvgIE.json +++ /dev/null @@ -1,1320 +0,0 @@ -{ - "@type": "AnnotationStore", - "@id": "ID8Sv2ynVKZX8wIt", - "resources": [ - { - "@type": "TextResource", - "@id": "26E4", - "@include": "../../base/26E4.txt" - } - ], - "annotationsets": [ - { - "@type": "AnnotationDataSet", - "@id": "segmentation_annotation", - "keys": [ - { - "@type": "DataKey", - "@id": "index" - }, - { - "@type": "DataKey", - "@id": "segmentation_type" - } - ], - "data": [ - { - "@type": "AnnotationData", - "@id": "84A8849AB4", - "key": "index", - "value": { - "@type": "Int", - "value": 1 - } - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "key": "segmentation_type", - "value": { - "@type": "String", - "value": "segmentation" - } - }, - { - "@type": "AnnotationData", - "@id": "222D05AA0A", - "key": "index", - "value": { - "@type": "Int", - "value": 2 - } - }, - { - "@type": "AnnotationData", - "@id": "E3785540BA", - "key": "index", - "value": { - "@type": "Int", - "value": 3 - } - }, - { - "@type": "AnnotationData", - "@id": "418F632528", - "key": "index", - "value": { - "@type": "Int", - "value": 4 - } - }, - { - "@type": "AnnotationData", - "@id": "9E813CA691", - "key": "index", - "value": { - "@type": "Int", - "value": 5 - } - }, - { - "@type": "AnnotationData", - "@id": "95ABD8599D", - "key": "index", - "value": { - "@type": "Int", - "value": 6 - } - }, - { - "@type": "AnnotationData", - "@id": "9AE80BBF3E", - "key": "index", - "value": { - "@type": "Int", - "value": 7 - } - }, - { - "@type": "AnnotationData", - "@id": "EC333836BE", - "key": "index", - "value": { - "@type": "Int", - "value": 8 - } - }, - { - "@type": "AnnotationData", - "@id": "A561344039", - "key": "index", - "value": { - "@type": "Int", - "value": 9 - } - }, - { - "@type": "AnnotationData", - "@id": "DBECC4EF6F", - "key": "index", - "value": { - "@type": "Int", - "value": 10 - } - }, - { - "@type": "AnnotationData", - "@id": "4F48498554", - "key": "index", - "value": { - "@type": "Int", - "value": 11 - } - }, - { - "@type": "AnnotationData", - "@id": "4B3DB7A8E7", - "key": "index", - "value": { - "@type": "Int", - "value": 12 - } - }, - { - "@type": "AnnotationData", - "@id": "3DDFE49DED", - "key": "index", - "value": { - "@type": "Int", - "value": 13 - } - }, - { - "@type": "AnnotationData", - "@id": "B6C64F6BBB", - "key": "index", - "value": { - "@type": "Int", - "value": 14 - } - }, - { - "@type": "AnnotationData", - "@id": "FFAF411C2C", - "key": "index", - "value": { - "@type": "Int", - "value": 15 - } - }, - { - "@type": "AnnotationData", - "@id": "232E82AD4C", - "key": "index", - "value": { - "@type": "Int", - "value": 16 - } - }, - { - "@type": "AnnotationData", - "@id": "F290DD99A9", - "key": "index", - "value": { - "@type": "Int", - "value": 17 - } - }, - { - "@type": "AnnotationData", - "@id": "CE5C4051BA", - "key": "index", - "value": { - "@type": "Int", - "value": 18 - } - }, - { - "@type": "AnnotationData", - "@id": "DE143AE56E", - "key": "index", - "value": { - "@type": "Int", - "value": 19 - } - }, - { - "@type": "AnnotationData", - "@id": "D49C52979C", - "key": "index", - "value": { - "@type": "Int", - "value": 20 - } - }, - { - "@type": "AnnotationData", - "@id": "4DF7691F34", - "key": "index", - "value": { - "@type": "Int", - "value": 21 - } - }, - { - "@type": "AnnotationData", - "@id": "0D98A8BED4", - "key": "index", - "value": { - "@type": "Int", - "value": 22 - } - }, - { - "@type": "AnnotationData", - "@id": "BC498F81BD", - "key": "index", - "value": { - "@type": "Int", - "value": 23 - } - }, - { - "@type": "AnnotationData", - "@id": "CC3CCCF793", - "key": "index", - "value": { - "@type": "Int", - "value": 24 - } - }, - { - "@type": "AnnotationData", - "@id": "6661E11FF6", - "key": "index", - "value": { - "@type": "Int", - "value": 25 - } - }, - { - "@type": "AnnotationData", - "@id": "E38E1956F9", - "key": "index", - "value": { - "@type": "Int", - "value": 26 - } - }, - { - "@type": "AnnotationData", - "@id": "91D6998190", - "key": "index", - "value": { - "@type": "Int", - "value": 27 - } - }, - { - "@type": "AnnotationData", - "@id": "34B618B6DD", - "key": "index", - "value": { - "@type": "Int", - "value": 28 - } - }, - { - "@type": "AnnotationData", - "@id": "10F713FAC8", - "key": "index", - "value": { - "@type": "Int", - "value": 29 - } - }, - { - "@type": "AnnotationData", - "@id": "DE82D2441F", - "key": "index", - "value": { - "@type": "Int", - "value": 30 - } - }, - { - "@type": "AnnotationData", - "@id": "C13C1ACA0C", - "key": "index", - "value": { - "@type": "Int", - "value": 31 - } - }, - { - "@type": "AnnotationData", - "@id": "07FB3155B3", - "key": "index", - "value": { - "@type": "Int", - "value": 32 - } - } - ] - } - ], - "annotations": [ - { - "@type": "Annotation", - "@id": "B07549701B", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 0 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 54 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "84A8849AB4", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "60010AB6CD", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 55 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 110 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "222D05AA0A", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "C8B26CFFA4", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 111 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 175 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E3785540BA", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "5C519C091F", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 176 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 193 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "418F632528", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "A99CFE3DC3", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 194 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 251 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "9E813CA691", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "BCBF7B0B44", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 252 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 287 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "95ABD8599D", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "74CAE409E9", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 288 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 428 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "9AE80BBF3E", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "2E2FB82547", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 429 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 527 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "EC333836BE", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "43ECDEE9B0", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 528 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 669 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "A561344039", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "AA18B6218B", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 670 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 730 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "DBECC4EF6F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "418BE00783", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 731 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 861 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4F48498554", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "DA91DA4BE4", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 862 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1321 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4B3DB7A8E7", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "B4245065D7", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1322 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1362 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "3DDFE49DED", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "0F1030EE43", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1363 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1363 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "B6C64F6BBB", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "97F7C93C66", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1364 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1435 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "FFAF411C2C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "3A307BA7F7", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1436 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1516 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "232E82AD4C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "B8EE2A4C35", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1517 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1667 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "F290DD99A9", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "80D90F8088", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1668 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1888 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "CE5C4051BA", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "7BBBEC254B", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1889 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 1976 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "DE143AE56E", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "7CE016E497", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 1977 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2155 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "D49C52979C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "CEDCB38435", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2156 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2269 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "4DF7691F34", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "8FF1A6F3F3", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2270 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2366 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "0D98A8BED4", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "671711C44C", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2367 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2527 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "BC498F81BD", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "0CFE1EF97C", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2528 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2763 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "CC3CCCF793", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "0BFEA4BBEF", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2764 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2821 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "6661E11FF6", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "6D8BDFC8FD", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2822 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 2923 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "E38E1956F9", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "74E5403E03", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 2924 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3106 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "91D6998190", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "3F2B818452", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 3107 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3216 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "34B618B6DD", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "D050E18B90", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 3217 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3262 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "10F713FAC8", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "4544050EE3", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 3263 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3305 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "DE82D2441F", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "9853B76832", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 3306 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3548 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "C13C1ACA0C", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - }, - { - "@type": "Annotation", - "@id": "769E1974BB", - "target": { - "@type": "TextSelector", - "resource": "26E4", - "offset": { - "@type": "Offset", - "begin": { - "@type": "BeginAlignedCursor", - "value": 3549 - }, - "end": { - "@type": "BeginAlignedCursor", - "value": 3606 - } - } - }, - "data": [ - { - "@type": "AnnotationData", - "@id": "07FB3155B3", - "set": "segmentation_annotation" - }, - { - "@type": "AnnotationData", - "@id": "B144C11491", - "set": "segmentation_annotation" - } - ] - } - ] -} \ No newline at end of file diff --git a/tests/test_ids.py b/tests/test_ids.py index 037eee7e..69140a7e 100644 --- a/tests/test_ids.py +++ b/tests/test_ids.py @@ -3,8 +3,6 @@ from openpecha.ids import ( get_annotation_id, get_base_id, - get_id, - get_initial_pecha_id, get_layer_id, get_uuid, ) @@ -16,16 +14,6 @@ def test_get_uuid(): r"^[0-9a-fA-F]{32}$", uuid ), f"UUID {uuid} is not in the correct format" - -def test_get_id(): - prefix = "T" - length = 4 - generated_id = get_id(prefix, length) - assert re.match( - r"^T[0-9A-F]{4}$", generated_id - ), f"ID {generated_id} is not in the correct format" - - def test_get_base_id(): base_id = get_base_id() assert re.match( @@ -40,13 +28,6 @@ def test_get_layer_id(): ), f"Layer ID {layer_id} is not in the correct format" -def test_get_initial_pecha_id(): - initial_pecha_id = get_initial_pecha_id() - assert re.match( - r"^I[0-9A-F]{8}$", initial_pecha_id - ), f"Initial Pecha ID {initial_pecha_id} is not in the correct format" - - def test_get_annotation_id(): ann_id = get_annotation_id() assert len(ann_id) == 10