diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..47ebd5e --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,27 @@ +# To get started with Dependabot version updates, you'll need to specify which +# package ecosystems to update and where the package manifests are located. +# Please see the documentation for all configuration options: +# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file + +version: 2 +updates: + # Enable version updates for GitHub Actions + - package-ecosystem: "github-actions" + groups: + actions: + patterns: + - "*" + directory: "/" + schedule: + # Check for updates to GitHub Actions every week + interval: "weekly" + + # Enable version updates for pre-commit hooks + - package-ecosystem: "pre-commit" + directory: "/" + schedule: + interval: "weekly" + groups: + pre-commit: + patterns: + - "*" diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 3d48c6f..e46401c 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -6,6 +6,10 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: check: runs-on: ubuntu-latest @@ -16,23 +20,27 @@ jobs: steps: - uses: actions/checkout@v5 - - name: Set up Python 3.12 - uses: actions/setup-python@v6 - with: - python-version: "3.12" - name: Install uv - uses: astral-sh/setup-uv@v6 + uses: astral-sh/setup-uv@v8.1.0 with: + python-version: "3.12" enable-cache: true cache-dependency-glob: "pyproject.toml" - name: Install package with check dependencies - run: uv sync --extra check + run: uv sync --group check + + - name: Run Ruff linter + uses: astral-sh/ruff-action@v3 + with: + args: "check --output-format=github" - # check with ruff - - name: Run ruff - run: uv run ruff check + - name: Run Ruff formatter + # NOTE: ruff format does not currently support github output format + 
run: ruff format --check --diff src tests + # Check formatting even if the previous step failed + if: always() # check docs build - name: Check that documentation builds with no errors or warnings diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 6e42013..2a008cf 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -11,6 +11,8 @@ name: Upload Python Package on: release: types: [published] + # allow manually running on main + workflow_dispatch: permissions: contents: read diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index fc93c1b..806b68f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -4,6 +4,10 @@ permissions: contents: read id-token: write +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + on: push: branches: @@ -34,23 +38,23 @@ jobs: - # use github python action instead of uv to take advantage of caching + # use setup-uv to install uv and the matrix Python version, with uv's cache keyed on pyproject.toml - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v6 + uses: astral-sh/setup-uv@v8.1.0 with: python-version: ${{ matrix.python }} - cache: 'pip' - cache-dependency-path: '**/pyproject.toml' + enable-cache: true + cache-dependency-glob: "pyproject.toml" - - name: Install package with dependencies - run: pip install -e ".[test]" + - name: Install package with test dependencies + run: uv sync --group test # for all versions but the one we use for code coverage, run normally - name: Run unit tests without code coverage - run: pytest + run: uv run pytest if: ${{ matrix.python != env.COV_PYTHON_VERSION }} # run code coverage in one version only - name: Run unit tests with code coverage reporting - run: pytest --cov=. + run: uv run pytest --cov=. 
if: ${{ matrix.python == env.COV_PYTHON_VERSION }} - name: Upload coverage to Codecov diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e49e09d..873c6fd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,22 +1,46 @@ files: \.py repos: + # ruff for linting and formatting python - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.4 + rev: v0.15.12 hooks: - - id: ruff - args: [ --fix, --exit-non-zero-on-fix ] + - id: ruff-check + args: [ --fix, --show-fixes, --exit-non-zero-on-fix ] - id: ruff-format - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v6.0.0 hooks: - id: check-case-conflict + - id: check-merge-conflict - id: check-executables-have-shebangs - id: debug-statements - id: end-of-file-fixer - id: mixed-line-ending - id: trailing-whitespace + - id: check-yaml + - id: name-tests-test + args: [--pytest-test-first] + - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 + rev: v2.0.0 hooks: - id: mypy additional_dependencies: [numpy] + # yamlfmt for formatting YAML files + - repo: https://github.com/google/yamlfmt + rev: v0.21.0 + hooks: + - id: yamlfmt + # Codespell for spell checking + - repo: https://github.com/codespell-project/codespell + rev: v2.4.2 + hooks: + - id: codespell + additional_dependencies: + - tomli + exclude_types: ["css", "html", "javascript", "json"] + # Validate GitHub Actions workflow files + - repo: https://github.com/mpalmer/action-validator + rev: v0.9.0 + hooks: + - id: action-validator diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a2bd875..250396f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -11,8 +11,8 @@ sphinx: configuration: docs/conf.py python: - install: - - method: pip - path: . 
- extra_requirements: - - docs + install: + - method: uv + command: sync + groups: + - docs diff --git a/CHANGELOG.md b/CHANGELOG.md index a2d2155..1635a90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Change Log + +## 0.7 + +- Add parsing to Gregorian date converter; supports month names (full or abbreviated) + in English, French, German, Spanish, Kinyarwanda, Ganda, and Tigrinya +- Add support for parsing Christian liturgical dates (fixed holidays and movable feasts) +- Include Gregorian dates and holidays in omnibus parser +- Updates to pyproject.toml, pre-commit hooks, ruff rules, dependabot configuration + ## 0.6.1 - Updated Read the Docs build to use Ubuntu 24.04 diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 6d4918c..6ea99d6 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -44,9 +44,15 @@ source .venv/bin/activate Install an editable version of the local package along with python dependencies needed for testing and development. ```sh -pip install -e ".[dev]" +pip install -e . --group=dev ``` +If using `uv`, run: + +```sh +uv sync --group test +``` + ### Install pre-commit hooks We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development. @@ -88,4 +94,19 @@ pip install -e ".[docs]" sphinx-build docs docs/_build ``` -HTML documentation will be generated in `docs/_build/html` \ No newline at end of file +HTML documentation will be generated in `docs/_build/html` + + +### Regenerating multilingual Gregorian month name parse file + +The Gregorian Lark parser includes a script-generated file, which +populates month names based on a list of language codes using the Babel +library. To regenerate, run the script with hatch (which should +be installed globally): + +```sh +hatch run codegen:generate +``` + +When the `.lark` file is modified by the script, it must be committed to git. 
+ diff --git a/README.md b/README.md index c1b662d..6eacf95 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,23 @@ **undate** is a python library for working with uncertain or partially known dates. -> [!WARNING] -> This is beta software and is not yet feature complete! Use with caution and give us feedback. -> Currently `undate` supports parsing and formatting dates in ISO8601, some -> portions of EDTF (Extended Date Time Format), and parsing and conversion for dates in Hebrew Anno Mundi and Islamic Hijri calendars. +> [!NOTE] +> This is beta software; it is still in development and not fully feature complete. If you use it, please let us know and share your feedback. + + +Currently `undate` supports parsing, formatting, and reasoning with dates of varying precision and in different calendars; dates with different precision and from different original calendars can be used together. Supported formats include: +- portions of EDTF (Extended Date Time Format) +- ISO8601 +- parsing and calendar conversion for dates in Hebrew Anno Mundi and Islamic Hijri calendars +- Gregorian dates with full or abbreviated month names in any order for multiple languages (English, Spanish, French, German, Kinyarwanda, Ganda, Tigrinya) +- Christian liturgical dates (fixed holidays and movable feasts) + +For unambiguous dates, there is an experimental omnibus parser which combines all supported date formats (bare years are currently assumed to be in the Gregorian calendar). + +For more about the origin and goals of `undate`, read our 2025 software paper: + +> Rebecca Sutton Koeser, Julia Damerow, Robert Casties, and Cole Crawford. “[Undate: Humanistic Dates for Computation](https://doi.org/10.1017/chr.2025.10006).” _Computational Humanities Research_, August 5, 2025. 
-_Undate was initially created as part of a [DH-Tech](https://dh-tech.github.io/) hackathon in November 2022._ --- @@ -20,14 +31,14 @@ _Undate was initially created as part of a [DH-Tech](https://dh-tech.github.io/) [![codecov](https://codecov.io/gh/dh-tech/undate-python/branch/main/graph/badge.svg?token=GE7HZE8C9D)](https://codecov.io/gh/dh-tech/undate-python) [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) +Project documentation is [available on ReadTheDocs](https://undate-python.readthedocs.io/en/latest/). + [![All Contributors](https://img.shields.io/badge/all_contributors-5-orange.svg?style=flat-square)](CONTRIBUTORS.md) -Read [Contributors](CONTRIBUTORS.md) for detailed contribution information. - ## Installation _Recommended_: use pip to install the latest published version from PyPI: @@ -242,6 +253,25 @@ Project documentation is [available on ReadTheDocs](https://undate-python.readth For instructions on setting up for local development, see [Developer Notes](DEVELOPER_NOTES.md). +See [Contributors](CONTRIBUTORS.md) for more detailed information about contributors. + + +## Publications & Presentations + +* Rebecca Sutton Koeser, Julia Damerow, Robert Casties, and Cole Crawford. “[Undate: Humanistic Dates for Computation](https://doi.org/10.1017/chr.2025.10006).” Software paper published in _Computational Humanities Research_, August 5, 2025. +* Rebecca Sutton Koeser. “[Undate: Computing with Uncertain and Partially-Unknown Dates](https://doi.org/10.5281/zenodo.17253974).” Computational notebook presented at USRSE’25. October 6, 2025. +* Rebecca Sutton Koeser. “[Undate in Action](https://rlskoeser.github.io/undate-in-action/).” Presentation at [Digital Humanities Tech Symposium](https://dh-tech.github.io/2025/06/04/digital-humanities-tech-symposium-agenda/), DH2025. July 14, 2025. +* Rebecca Sutton Koeser. 2025. 
“[Undate in Action](https://doi.org/10.63744/SFtXXpIE4ERh).” In Digital Humanities Tech Symposium 2025—Anthology of Computers and the Humanities, edited by Julia Damerow and Rebecca Sutton Koeser, vol. 2. +* Rebecca Sutton Koeser. [Join me for a DHTech hackathon? It’s an un-date!](https://dh-tech.github.io/blog/2023/02/09/hackathon-undate/) DHTech, February 9, 2023. + +## Related Projects + +- ISO8601 date support and dates with unknown years, including duration, adapted from [Shakespeare and Company Project](https://shakespeareandco.princeton.edu/) ([codebase](https://github.com/Princeton-CDH/mep-django)) +- Parsing and calendar conversion for Hebrew Anno Mundi and Islamic Hijri calendars adapted from [Princeton Geniza Project (PGP)](https://geniza.princeton.edu/) ([codebase](https://github.com/Princeton-CDH/geniza/)); improved and verified with data and logic from the [Islamic Scientific Manuscripts Initiative (ISMI)](https://ismi.mpiwg-berlin.mpg.de/) +- Parsing for dates in African languages inspired by work on and partially checked against data from [MasakhaNER](https://github.com/masakhane-io/masakhane-ner) +- Parsing and calendar conversion for Christian liturgical holidays adapted from work on [Hale/Eliot Letters project](https://cdh.princeton.edu/projects/haleeliot-letters/) + + ## License This software is licensed under the [Apache 2.0 License](LICENSE.md). 
diff --git a/examples/edtf-support.ipynb b/examples/edtf-support.ipynb index a604838..5cc7e20 100644 --- a/examples/edtf-support.ipynb +++ b/examples/edtf-support.ipynb @@ -34,12 +34,12 @@ "### Date\n", "\n", "```\n", - "complete representation: [year][“-”][month][“-”][day]\n", - "Example 1 ‘1985-04-12’ refers to the calendar date 1985 April 12th with day precision.\n", - "reduced precision for year and month: [year][“-”][month]\n", - "Example 2 ‘1985-04’ refers to the calendar month 1985 April with month precision.\n", + "complete representation: [year][-][month][-][day]\n", + "Example 1 1985-04-12 refers to the calendar date 1985 April 12th with day precision.\n", + "reduced precision for year and month: [year][-][month]\n", + "Example 2 1985-04 refers to the calendar month 1985 April with month precision.\n", "reduced precision for year: [year]\n", - "Example 3 ‘1985’ refers to the calendar year 1985 with year precision.\n", + "Example 3 1985 refers to the calendar year 1985 with year precision.\n", "```" ] }, @@ -60,9 +60,9 @@ "metadata": {}, "outputs": [], "source": [ - "import datetime \n", + "import datetime\n", "\n", - "from undate import Undate, UndateInterval, DatePrecision\n", + "from undate import DatePrecision, Undate, UndateInterval\n", "\n", "# Example 1: day\n", "day = Undate.parse(\"1985-04-12\", \"EDTF\")\n", @@ -97,8 +97,8 @@ "metadata": {}, "outputs": [], "source": [ - "from undate.undate import Undate, DatePrecision\n", "from undate.converters.edtf import EDTFDateConverter\n", + "from undate.undate import DatePrecision, Undate\n", "\n", "# set default format to EDTF\n", "Undate.DEFAULT_CONVERTER = \"EDTF\"\n", @@ -140,12 +140,12 @@ "EDTF Level 0 adopts representations of a time interval where both the start and end are dates: start and end date only; that is, both start and duration, and duration and end, are excluded. 
Time of day is excluded.\n", "\n", "```\n", - " Example 1 ‘1964/2008’ is a time interval with calendar year precision, beginning sometime in 1964 and ending sometime in 2008.\n", - " Example 2 ‘2004-06/2006-08’ is a time interval with calendar month precision, beginning sometime in June 2004 and ending sometime in August of 2006.\n", - " Example 3 ‘2004-02-01/2005-02-08’ is a time interval with calendar day precision, beginning sometime on February 1, 2004 and ending sometime on February 8, 2005.\n", - " Example 4 ‘2004-02-01/2005-02’ is a time interval beginning sometime on February 1, 2004 and ending sometime in February 2005. Since the start endpoint precision (day) is different than that of the end endpoint (month) the precision of the time interval at large is undefined.\n", - " Example 5 ‘2004-02-01/2005’ is a time interval beginning sometime on February 1, 2004 and ending sometime in 2005. The start endpoint has calendar day precision and the end endpoint has calendar year precision. Similar to the previous example, the precision of the time interval at large is undefined.\n", - " Example 6 ‘2005/2006-02’ is a time interval beginning sometime in 2005 and ending sometime in February 2006.\n", + " Example 1 1964/2008 is a time interval with calendar year precision, beginning sometime in 1964 and ending sometime in 2008.\n", + " Example 2 2004-06/2006-08 is a time interval with calendar month precision, beginning sometime in June 2004 and ending sometime in August of 2006.\n", + " Example 3 2004-02-01/2005-02-08 is a time interval with calendar day precision, beginning sometime on February 1, 2004 and ending sometime on February 8, 2005.\n", + " Example 4 2004-02-01/2005-02 is a time interval beginning sometime on February 1, 2004 and ending sometime in February 2005. 
Since the start endpoint precision (day) is different than that of the end endpoint (month) the precision of the time interval at large is undefined.\n", + " Example 5 2004-02-01/2005 is a time interval beginning sometime on February 1, 2004 and ending sometime in 2005. The start endpoint has calendar day precision and the end endpoint has calendar year precision. Similar to the previous example, the precision of the time interval at large is undefined.\n", + " Example 6 2005/2006-02 is a time interval beginning sometime in 2005 and ending sometime in February 2006.\n", "```" ] }, @@ -179,7 +179,7 @@ "assert isinstance(day_range, UndateInterval)\n", "assert day_range.earliest == Undate(2004, 2, 1)\n", "assert day_range.latest == Undate(2005, 2, 8)\n", - "# Example 4 \n", + "# Example 4\n", "day_month_range = Undate.parse(\"2004-02-01/2005-02\", \"EDTF\")\n", "assert isinstance(day_range, UndateInterval)\n", "assert day_month_range.earliest == Undate(2004, 2, 1)\n", @@ -193,13 +193,13 @@ "assert day_year_range.latest == Undate(2005)\n", "assert day_year_range.earliest.precision == DatePrecision.DAY\n", "assert day_year_range.latest.precision == DatePrecision.YEAR\n", - "# Example 6 \n", + "# Example 6\n", "year_month_range = Undate.parse(\"2005/2006-02\", \"EDTF\")\n", "assert isinstance(year_month_range, UndateInterval)\n", "assert year_month_range.earliest == Undate(2005)\n", "assert year_month_range.latest == Undate(2006, 2)\n", "assert year_month_range.earliest.precision == DatePrecision.YEAR\n", - "assert year_month_range.latest.precision == DatePrecision.MONTH\n" + "assert year_month_range.latest.precision == DatePrecision.MONTH" ] }, { @@ -220,14 +220,24 @@ "# Example 1\n", "assert UndateInterval(Undate(1964), Undate(2008)).format(\"EDTF\") == \"1964/2008\"\n", "# Example 2\n", - "assert UndateInterval(Undate(2004, 6), Undate(2006, 8)).format(\"EDTF\") == \"2004-06/2006-08\"\n", + "assert (\n", + " UndateInterval(Undate(2004, 6), Undate(2006, 
8)).format(\"EDTF\") == \"2004-06/2006-08\"\n", + ")\n", "# Example 3\n", - "assert UndateInterval(Undate(2004, 2, 1), Undate(2005, 2, 8)).format(\"EDTF\") == \"2004-02-01/2005-02-08\"\n", - "# Example 4 \n", - "assert UndateInterval(Undate(2004, 2, 1), Undate(2005, 2)).format(\"EDTF\") == \"2004-02-01/2005-02\"\n", + "assert (\n", + " UndateInterval(Undate(2004, 2, 1), Undate(2005, 2, 8)).format(\"EDTF\")\n", + " == \"2004-02-01/2005-02-08\"\n", + ")\n", + "# Example 4\n", + "assert (\n", + " UndateInterval(Undate(2004, 2, 1), Undate(2005, 2)).format(\"EDTF\")\n", + " == \"2004-02-01/2005-02\"\n", + ")\n", "# Example 5\n", - "assert UndateInterval(Undate(2004, 2, 1), Undate(2005)).format(\"EDTF\") == \"2004-02-01/2005\"\n", - "# Example 6 \n", + "assert (\n", + " UndateInterval(Undate(2004, 2, 1), Undate(2005)).format(\"EDTF\") == \"2004-02-01/2005\"\n", + ")\n", + "# Example 6\n", "assert UndateInterval(Undate(2005), Undate(2006, 2)).format(\"EDTF\") == \"2005/2006-02\"" ] }, @@ -248,8 +258,8 @@ "\n", "'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. 
for years later than 9999 or earlier than -9999.\n", "```\n", - " Example 1 'Y170000002' is the year 170000002\n", - " Example 2 'Y-170000002' is the year -170000002\n", + " Example 1 Y170000002 is the year 170000002\n", + " Example 2 Y-170000002 is the year -170000002\n", "```\n" ] }, @@ -307,14 +317,14 @@ "The character 'X' may be used in place of one or more rightmost digits to indicate that the value of that digit is unspecified, for the following cases:\n", "```\n", " A year with one or two (rightmost) unspecified digits in a year-only expression (year precision)\n", - " Example 1 ‘201X’\n", - " Example 2 ‘20XX’\n", + " Example 1 201X\n", + " Example 2 20XX\n", " Year specified, month unspecified in a year-month expression (month precision)\n", - " Example 3 ‘2004-XX’\n", + " Example 3 2004-XX\n", " Year and month specified, day unspecified in a year-month-day expression (day precision)\n", - " Example 4 ‘1985-04-XX’ \n", + " Example 4 1985-04-XX\n", " Year specified, day and month unspecified in a year-month-day expression (day precision)\n", - " Example 5 ‘1985-XX-XX’ \n", + " Example 5 1985-XX-XX\n", "```" ] }, @@ -325,7 +335,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 1 ‘201X’\n", + "# Example 1 201X\n", "# parse\n", "date = Undate.parse(\"201X\", \"EDTF\")\n", "assert date.year == \"201X\"\n", @@ -336,7 +346,7 @@ "# format\n", "assert str(Undate(\"201X\")) == \"201X\"\n", "\n", - "# Example 2 ‘20XX’\n", + "# Example 2 20XX\n", "# parse\n", "date = Undate.parse(\"20XX\", \"EDTF\")\n", "assert date.year == \"20XX\"\n", @@ -347,7 +357,7 @@ "# format\n", "assert str(Undate(\"20XX\")) == \"20XX\"\n", "\n", - "# Example 3 ‘2004-XX’\n", + "# Example 3 2004-XX\n", "# parse\n", "date = Undate.parse(\"2004-XX\", \"EDTF\")\n", "assert date.year == \"2004\"\n", @@ -359,7 +369,7 @@ "# format\n", "assert str(Undate(2004, \"XX\")) == \"2004-XX\"\n", "\n", - "# Example 4 ‘1985-04-XX’ \n", + "# Example 4 1985-04-XX\n", "# parse\n", "date = 
Undate.parse(\"1985-04-XX\", \"EDTF\")\n", "assert date.year == \"1985\"\n", @@ -372,7 +382,7 @@ "# format\n", "assert str(Undate(1985, 4, \"XX\")) == \"1985-04-XX\"\n", "\n", - "# Example 5 ‘1985-XX-XX’ \n", + "# Example 5 1985-XX-XX\n", "# parse\n", "date = Undate.parse(\"1985-XX-XX\", \"EDTF\")\n", "assert date.year == \"1985\"\n", @@ -384,7 +394,9 @@ "assert date.latest.month == 12\n", "# earliest/latest possible days\n", "assert date.earliest.day == 1\n", - "assert date.latest.day == 31 # undate guesses maximum month length when month is unknown\n", + "assert (\n", + " date.latest.day == 31\n", + ") # undate guesses maximum month length when month is unknown\n", "# format\n", "assert str(Undate(1985, \"XX\", \"XX\")) == \"1985-XX-XX\"" ] @@ -415,11 +427,11 @@ "`undate` supports open ended time intervals, but does not currently distinguish between null string and double dot.\n", "\n", "\n", - " Example 1 ‘1985-04-12/..’\n", + " Example 1 1985-04-12/..\n", " interval starting at 1985 April 12th with day precision; end open\n", - " Example 2 ‘1985-04/..’\n", + " Example 2 1985-04/..\n", " interval starting at 1985 April with month precision; end open\n", - " Example 3 ‘1985/..’\n", + " Example 3 1985/..\n", " interval starting at year 1985 with year precision; end open\n" ] }, @@ -432,7 +444,7 @@ "source": [ "import datetime\n", "\n", - "# Example 1 ‘1985-04-12/..’\n", + "# Example 1 1985-04-12/..\n", "# parse\n", "interval = Undate.parse(\"1985-04-12/..\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -443,7 +455,7 @@ "# NOTE: undate interval does not currently distinguish between double dot and null string\n", "assert str(UndateInterval(Undate(1985, 4, 12), None)) == \"1985-04-12/\"\n", "\n", - "# Example 2 ‘1985-04/..’\n", + "# Example 2 1985-04/..\n", "# parse\n", "interval = Undate.parse(\"1985-04/..\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -453,7 +465,7 @@ "# format\n", "assert str(UndateInterval(Undate(1985, 
4), None)) == \"1985-04/\"\n", "\n", - "# Example 3 ‘1985/..’\n", + "# Example 3 1985/..\n", "# parse\n", "interval = Undate.parse(\"1985/..\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -471,11 +483,11 @@ "source": [ "#### Open start time interval\n", "\n", - " Example 4 ‘../1985-04-12’\n", + " Example 4 ../1985-04-12\n", " interval with open start; ending 1985 April 12th with day precision\n", - " Example 5 ‘../1985-04’\n", + " Example 5 ../1985-04\n", " interval with open start; ending 1985 April with month precision\n", - " Example 6 ‘../1985’\n", + " Example 6 ../1985\n", " interval with open start; ending at year 1985 with year precision" ] }, @@ -486,7 +498,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 4 ‘../1985-04-12’\n", + "# Example 4 ../1985-04-12\n", "# parse\n", "interval = Undate.parse(\"../1985-04-12\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -497,7 +509,7 @@ "# NOTE: undate interval does not currently distinguish between double dot and null string\n", "assert str(UndateInterval(None, Undate(1985, 4, 12))) == \"../1985-04-12\"\n", "\n", - "# Example 5 ‘../1985-04’\n", + "# Example 5 ../1985-04\n", "# parse\n", "interval = Undate.parse(\"../1985-04\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -505,9 +517,17 @@ "assert interval.latest == Undate(1985, 4)\n", "assert interval.latest.precision == DatePrecision.MONTH\n", "# format\n", - "assert str(UndateInterval(None, Undate(1985, 4), )) == \"../1985-04\"\n", - "\n", - "# Example 6 ‘../1985’\n", + "assert (\n", + " str(\n", + " UndateInterval(\n", + " None,\n", + " Undate(1985, 4),\n", + " )\n", + " )\n", + " == \"../1985-04\"\n", + ")\n", + "\n", + "# Example 6 ../1985\n", "# parse\n", "interval = Undate.parse(\"../1985\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -525,11 +545,11 @@ "source": [ "#### Time interval with unknown end\n", "\n", - " Example 7 ‘1985-04-12/’\n", + " Example 7 
1985-04-12/\n", " interval starting 1985 April 12th with day precision; end unknown\n", - " Example 8 ‘1985-04/’\n", + " Example 8 1985-04/\n", " interval starting 1985 April with month precision; end unknown\n", - " Example 9 ‘1985/’\n", + " Example 9 1985/\n", " interval starting year 1985 with year precision; end unknown\n" ] }, @@ -540,7 +560,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 7 ‘1985-04-12/’\n", + "# Example 7 1985-04-12/\n", "# parse\n", "interval = Undate.parse(\"1985-04-12/\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -551,7 +571,7 @@ "# NOTE: undate interval does not currently distinguish between double dot and null string\n", "assert str(UndateInterval(Undate(1985, 4, 12), None)) == \"1985-04-12/\"\n", "\n", - "# Example 8 ‘1985-04/’\n", + "# Example 8 1985-04/\n", "# parse\n", "interval = Undate.parse(\"1985-04/\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -561,7 +581,7 @@ "# format\n", "assert str(UndateInterval(Undate(1985, 4), None)) == \"1985-04/\"\n", "\n", - "# Example 9 ‘1985/’\n", + "# Example 9 1985/\n", "# parse\n", "interval = Undate.parse(\"1985/\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -579,11 +599,11 @@ "source": [ "#### Time interval with unknown start\n", "\n", - " Example 10 ‘/1985-04-12’\n", + " Example 10 /1985-04-12\n", " interval with unknown start; ending 1985 April 12th with day precision\n", - " Example 11 ‘/1985-04’\n", + " Example 11 /1985-04\n", " interval with unknown start; ending 1985 April with month precision\n", - " Example 12 ‘/1985’\n", + " Example 12 /1985\n", " interval with unknown start; ending year 1985 with year precision\n" ] }, @@ -594,7 +614,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 10 ‘/1985-04-12’\n", + "# Example 10 /1985-04-12\n", "# parse\n", "interval = Undate.parse(\"/1985-04-12\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -605,7 +625,7 @@ "# NOTE: undate 
interval does not currently distinguish between double dot and null string\n", "assert str(UndateInterval(None, Undate(1985, 4, 12))) == \"../1985-04-12\"\n", "\n", - "# Example 11 ‘/1985-04’\n", + "# Example 11 /1985-04\n", "# parse\n", "interval = Undate.parse(\"/1985-04\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -613,9 +633,17 @@ "assert interval.latest == Undate(1985, 4)\n", "assert interval.latest.precision == DatePrecision.MONTH\n", "# format\n", - "assert str(UndateInterval(None, Undate(1985, 4), )) == \"../1985-04\"\n", - "\n", - "# Example 12 ‘/1985’\n", + "assert (\n", + " str(\n", + " UndateInterval(\n", + " None,\n", + " Undate(1985, 4),\n", + " )\n", + " )\n", + " == \"../1985-04\"\n", + ")\n", + "\n", + "# Example 12 /1985\n", "# parse\n", "interval = Undate.parse(\"/1985\", \"EDTF\")\n", "assert isinstance(interval, UndateInterval)\n", @@ -633,7 +661,7 @@ "source": [ "#### Negative calendar year\n", "\n", - " Example 1 ‘-1985’\n", + " Example 1 -1985\n", "\n", "Note: ISO 8601 Part 1 does not support negative year. 
" ] @@ -645,7 +673,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 1 ‘-1985’\n", + "# Example 1 -1985\n", "# parse\n", "neg_year = Undate.parse(\"-1985\", \"EDTF\")\n", "assert neg_year.year == \"-1985\"\n", @@ -672,17 +700,17 @@ "\n", "For level 2 the unspecified digit, 'X', may occur anywhere within a component.\n", "\n", - " Example 1 ‘156X-12-25’\n", + " Example 1 156X-12-25\n", " December 25 sometime during the 1560s\n", - " Example 2 ‘15XX-12-25’\n", + " Example 2 15XX-12-25\n", " December 25 sometime during the 1500s\n", - " Example 3 ‘XXXX-12-XX’\n", + " Example 3 XXXX-12-XX\n", " Some day in December in some year\n", - " Example 4 '1XXX-XX’\n", + " Example 4 1XXX-XX\n", " Some month during the 1000s\n", - " Example 5 ‘1XXX-12’\n", + " Example 5 1XXX-12\n", " Some December during the 1000s\n", - " Example 6 ‘1984-1X’\n", + " Example 6 1984-1X\n", " October, November, or December 1984" ] }, @@ -693,7 +721,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Example 1 ‘156X-12-25’\n", + "# Example 1 156X-12-25\n", "# parse\n", "december = Undate.parse(\"156X-12-25\", \"EDTF\")\n", "assert december.year == \"156X\"\n", @@ -705,7 +733,7 @@ "# format\n", "assert str(Undate(\"156X\", 12, 25)) == \"156X-12-25\"\n", "\n", - "# Example 2 ‘15XX-12-25’\n", + "# Example 2 15XX-12-25\n", "# parse\n", "december = Undate.parse(\"15XX-12-25\", \"EDTF\")\n", "assert december.year == \"15XX\"\n", @@ -717,7 +745,7 @@ "# format\n", "assert str(Undate(\"15XX\", 12, 25)) == \"15XX-12-25\"\n", "\n", - "# Example 3 ‘XXXX-12-XX’\n", + "# Example 3 XXXX-12-XX\n", "# parse\n", "december = Undate.parse(\"XXXX-12-XX\", \"EDTF\")\n", "assert december.year == \"XXXX\"\n", @@ -732,7 +760,7 @@ "# format\n", "assert str(Undate(\"XXXX\", 12, \"XX\")) == \"XXXX-12-XX\"\n", "\n", - "# Example 4 '1XXX-XX’\n", + "# Example 4 1XXX-XX\n", "# parse\n", "some_month = Undate.parse(\"1XXX-XX\", \"EDTF\")\n", "assert some_month.year == \"1XXX\"\n", @@ -743,7 +771,7 @@ "# format\n", "assert 
str(Undate(\"1XXX\", \"XX\")) == \"1XXX-XX\"\n", "\n", - "# Example 5 ‘1XXX-12’\n", + "# Example 5 1XXX-12\n", "# parse\n", "some_december = Undate.parse(\"1XXX-12\", \"EDTF\")\n", "assert some_december.year == \"1XXX\"\n", @@ -754,7 +782,7 @@ "# format\n", "assert str(Undate(\"1XXX\", 12)) == \"1XXX-12\"\n", "\n", - "# Example 6 ‘1984-1X’\n", + "# Example 6 1984-1X\n", "# parse\n", "late_1984 = Undate.parse(\"1984-1X\", \"EDTF\")\n", "assert late_1984.year == \"1984\"\n", diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb index 43a858c..65ece5e 100644 --- a/examples/pgp_dates.ipynb +++ b/examples/pgp_dates.ipynb @@ -34,20 +34,31 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_78526/1738353942.py:6: DtypeWarning: Columns (31) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", + " documents = pd.read_csv(pgp_documents_csv)\n" + ] + } + ], "source": [ "import pandas as pd\n", "\n", - "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n", + "pgp_documents_csv = (\n", + " \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n", + ")\n", "documents = pd.read_csv(pgp_documents_csv)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346", "metadata": {}, "outputs": [ @@ -56,16 +67,18 @@ "output_type": "stream", "text": [ "\n", - "Total documents: 35,187\n", - "Documents with dates: 4,451\n", - " date on document: 4,126\n", - " inferred dating: 331\n" + "Total documents: 35,938\n", + "Documents with dates: 6,737\n", + " date on document: 4,729\n", + " inferred dating: 2,040\n" ] } ], "source": [ "# limit to documents with dates\n", - "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n", + "docs_with_dates = documents[\n", + " documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()\n", + "]\n", "docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n", "docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n", "\n", @@ -78,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "94d6340b-10d0-461b-b745-378ffa1ffcec", "metadata": {}, "outputs": [ @@ -115,7 +128,14 @@ " 449\n", " 1570\n", " Seleucid\n", - " 1259\n", + " 1258-08-31/1259-09-19\n", + " \n", + " \n", + " 15\n", + " 462\n", + " NaN\n", + " NaN\n", + " 1056-06\n", " \n", " \n", " 16\n", @@ -173,13 +193,6 @@ " Seleucid\n", " 1130-10-06/1130-10-15\n", " \n", - " \n", - " 61\n", - " 524\n", - " Thursday, 12 Sivan 4795\n", - " Anno Mundi\n", - " 1035-05-22\n", - " \n", " \n", "\n", "" @@ -187,6 +200,7 @@ "text/plain": [ " pgpid 
doc_date_original doc_date_calendar \\\n", "5 449 1570 Seleucid \n", + "15 462 NaN NaN \n", "16 463 19 Adar 1427 Seleucid \n", "17 464 Tammuz 1288 Seleucid \n", "23 472 1337 Seleucid \n", @@ -195,10 +209,10 @@ "43 502 Tevet 1548 Seleucid \n", "47 506 Elul 1428 Seleucid \n", "55 516 First decade of Ḥeshvan 1442 Seleucid \n", - "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n", "\n", " doc_date_standard \n", - "5 1259 \n", + "5 1258-08-31/1259-09-19 \n", + "15 1056-06 \n", "16 1116-03-05 \n", "17 0977-06-21/0977-07-19 \n", "23 1025-08-28/1026-09-14 \n", @@ -206,17 +220,18 @@ "41 1188-12-07 \n", "43 1236-11-30/1236-12-28 \n", "47 1117-08-01/1117-08-29 \n", - "55 1130-10-06/1130-10-15 \n", - "61 1035-05-22 " + "55 1130-10-06/1130-10-15 " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + "docs_with_docdate[\n", + " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n", + "].head(10)" ] }, { @@ -231,7 +246,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7", "metadata": {}, "outputs": [ @@ -252,27 +267,30 @@ "from lark.visitors import VisitError\n", "\n", "# first, how far can we get with the standard dates? 
can we parse as edtf and sort, render?\n", - "from undate import Undate \n", + "from undate import Undate\n", + "\n", "\n", "def parse_standard_date(value):\n", " try:\n", " return Undate.parse(value, \"EDTF\")\n", " except VisitError as err:\n", " print(f\"Parse error on {value}: {err}\")\n", - " \n", + "\n", "\n", "# ignore gregorian/julian distinction for now\n", "# from pgp code:\n", "# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n", "# cut off between gregorian/julian dates, in julian days\n", - "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n", + "# gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n", "\n", - "docs_with_docdate['undate_standard'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)" + "docs_with_docdate[\"undate_standard\"] = docs_with_docdate.doc_date_standard.apply(\n", + " parse_standard_date\n", + ")" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "f49e82a4-b05b-4395-998f-0c9e75729e9f", "metadata": {}, "outputs": [ @@ -306,7 +324,7 @@ " \n", " \n", " \n", - " 3190\n", + " 3181\n", " 3957\n", " middle decade of Adar 1528\n", " Seleucid\n", @@ -314,7 +332,7 @@ " 2025-04-12 20:45:36.603800+00:00\n", " \n", " \n", - " 34437\n", + " 34293\n", " 40006\n", " NaN\n", " NaN\n", @@ -327,15 +345,15 @@ ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "3190 3957 middle decade of Adar 1528 Seleucid \n", - "34437 40006 NaN NaN \n", + "3181 3957 middle decade of Adar 1528 Seleucid \n", + "34293 40006 NaN NaN \n", "\n", " doc_date_standard last_modified \n", - "3190 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n", - "34437 1747-02-29 2024-08-07 18:24:19.425288+00:00 " + "3181 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n", + "34293 1747-02-29 2024-08-07 18:24:19.425288+00:00 " ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -345,7 
+363,15 @@ "\n", "# this is probably a data error in the original\n", "\n", - "docs_with_docdate[docs_with_docdate.undate_standard.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]" + "docs_with_docdate[docs_with_docdate.undate_standard.isna()][\n", + " [\n", + " \"pgpid\",\n", + " \"doc_date_original\",\n", + " \"doc_date_calendar\",\n", + " \"doc_date_standard\",\n", + " \"last_modified\",\n", + " ]\n", + "]" ] }, { @@ -358,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "2d502575-a2b4-4fce-9f59-6932275dfac2", "metadata": {}, "outputs": [ @@ -366,14 +392,14 @@ "data": { "text/plain": [ "doc_date_calendar\n", - "Seleucid 1604\n", - "Anno Mundi 1147\n", - "Hijrī 884\n", + "Seleucid 1794\n", + "Anno Mundi 1399\n", + "Hijrī 1063\n", "Kharājī 8\n", "Name: count, dtype: int64" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -384,7 +410,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "04e4ffb2-13e7-49cc-913b-2104b61aef16", "metadata": {}, "outputs": [ @@ -424,6 +450,13 @@ " 1035-05-22\n", " \n", " \n", + " 70\n", + " 534\n", + " 3 Adar II 4845\n", + " Anno Mundi\n", + " 1085-03-02\n", + " \n", + " \n", " 90\n", " 561\n", " 10 Nisan 4716\n", @@ -452,6 +485,13 @@ " 1044-08-27/1045-09-13\n", " \n", " \n", + " 174\n", + " 657\n", + " [Tammuz] 4831\n", + " Anno Mundi\n", + " 1071-06-02/1071-06-30\n", + " \n", + " \n", " 177\n", " 660\n", " 22 Sivan 4974\n", @@ -472,20 +512,6 @@ " Anno Mundi\n", " 1051-08-18\n", " \n", - " \n", - " 255\n", - " 750\n", - " Friday, 24 Ḥeshvan 4765\n", - " Anno Mundi\n", - " 1004-11-10\n", - " \n", - " \n", - " 264\n", - " 760\n", - " Thursday, 11 Av 4783\n", - " Anno Mundi\n", - " 1023-08-01\n", - " \n", " \n", "\n", "" @@ -493,25 +519,27 @@ "text/plain": [ " pgpid doc_date_original doc_date_calendar doc_date_standard\n", "61 524 Thursday, 12 Sivan 4795 
Anno Mundi 1035-05-22\n", + "70 534 3 Adar II 4845 Anno Mundi 1085-03-02\n", "90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n", "111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n", "119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n", "131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n", + "174 657 [Tammuz] 4831 Anno Mundi 1071-06-02/1071-06-30\n", "177 660 22 Sivan 4974 Anno Mundi 1214-06-01\n", "207 695 Friday, [25] Nisan [4810] Anno Mundi 1050-04-20\n", - "215 703 8 Elul (4)811 Anno Mundi 1051-08-18\n", - "255 750 Friday, 24 Ḥeshvan 4765 Anno Mundi 1004-11-10\n", - "264 760 Thursday, 11 Av 4783 Anno Mundi 1023-08-01" + "215 703 8 Elul (4)811 Anno Mundi 1051-08-18" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# example hebrew dates\n", - "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n", + " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n", + "].head(10)" ] }, { @@ -530,7 +558,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408", "metadata": {}, "outputs": [ @@ -538,8 +566,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_38072/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", - " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n" + "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_78526/2303123184.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n", + " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n" ] }, { 
@@ -571,56 +599,69 @@ " \n", " \n", " \n", - " 702\n", + " 698\n", " 1223\n", " Wednesday, 9 Tammuz 4912 AM\n", " Anno Mundi\n", " 1152-06-13\n", " \n", " \n", - " 16698\n", + " 16600\n", " 19975\n", " Sunday, 10 Kislev 5583 AM\n", " Anno Mundi\n", " 1822-11-24\n", " \n", " \n", - " 25415\n", + " 25299\n", " 30550\n", " Tammuz 5537 AM\n", " Anno Mundi\n", " 1777-07-06/1777-08-03\n", " \n", + " \n", + " 35805\n", + " 41550\n", + " 3 Av 5325 AM\n", + " Anno Mundi\n", + " 1565-07-01\n", + " \n", " \n", "\n", "" ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n", - "16698 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n", - "25415 30550 Tammuz 5537 AM Anno Mundi \n", + "698 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n", + "16600 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n", + "25299 30550 Tammuz 5537 AM Anno Mundi \n", + "35805 41550 3 Av 5325 AM Anno Mundi \n", "\n", " doc_date_standard \n", - "702 1152-06-13 \n", - "16698 1822-11-24 \n", - "25415 1777-07-06/1777-08-03 " + "698 1152-06-13 \n", + "16600 1822-11-24 \n", + "25299 1777-07-06/1777-08-03 \n", + "35805 1565-07-01 " ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# how many end with AM ?\n", - "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n", - "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n", + " docs_with_docdate.doc_date_original.notna()\n", + "]\n", + "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][\n", + " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n", + "]" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 
13, "id": "cd1a751a-5299-418f-a3f8-050ab0384354", "metadata": {}, "outputs": [ @@ -653,41 +694,41 @@ " \n", " \n", " \n", - " 1556\n", + " 1120\n", + " 1699\n", + " Adar 52[..]\n", + " Anno Mundi\n", + " 1440-02-05/1539-02-18\n", + " \n", + " \n", + " 1132\n", + " 1711\n", + " 9 Av 13[.]8\n", + " Seleucid\n", + " 0997-07-17/1087-07-13\n", + " \n", + " \n", + " 1551\n", " 2163\n", " first third of Tammuz 500[.]\n", " Anno Mundi\n", " 1244/1249\n", " \n", " \n", - " 1567\n", + " 1562\n", " 2175\n", " End of Sivan 152[.]\n", " Seleucid\n", " 1209/1218\n", " \n", " \n", - " 1753\n", + " 1748\n", " 2460\n", " 13[..]\n", " Seleucid\n", " 988/1088\n", " \n", " \n", - " 2018\n", - " 2745\n", - " 1[.] Kislev 48[..]\n", - " Anno Mundi\n", - " 1039-11-30/1138-11-24\n", - " \n", - " \n", - " 3044\n", - " 3805\n", - " 13[..]\n", - " Seleucid\n", - " 988/1087\n", - " \n", - " \n", " ...\n", " ...\n", " ...\n", @@ -695,35 +736,35 @@ " ...\n", " \n", " \n", - " 30589\n", - " 35955\n", - " 12 Muḥarram 52[.]\n", - " Hijrī\n", - " 1126/1134\n", - " \n", - " \n", - " 31226\n", - " 36738\n", - " 54[.]\n", - " Hijrī\n", - " 1145/1154\n", - " \n", - " \n", - " 32548\n", + " 32412\n", " 38077\n", " 14[...]\n", " Seleucid\n", " 1088-09-19/1188-09-23\n", " \n", " \n", - " 34652\n", + " 32804\n", + " 38478\n", + " 19 Tevet 47[..]\n", + " Anno Mundi\n", + " 0940-01-02/1038-12-19\n", + " \n", + " \n", + " 34173\n", + " 39886\n", + " 4[.]4\n", + " Hijrī\n", + " 1023/1101\n", + " \n", + " \n", + " 34503\n", " 40226\n", " 49[.]\n", " Hijrī\n", " 1096-12-19/1106-09-01\n", " \n", " \n", - " 34760\n", + " 34611\n", " 40335\n", " [4]82[.]\n", " Anno Mundi\n", @@ -731,52 +772,55 @@ " \n", " \n", "\n", - "

66 rows × 4 columns

\n", + "

115 rows × 4 columns

\n", "" ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", - "1567 2175 End of Sivan 152[.] Seleucid \n", - "1753 2460 13[..] Seleucid \n", - "2018 2745 1[.] Kislev 48[..] Anno Mundi \n", - "3044 3805 13[..] Seleucid \n", + "1120 1699 Adar 52[..] Anno Mundi \n", + "1132 1711 9 Av 13[.]8 Seleucid \n", + "1551 2163 first third of Tammuz 500[.] Anno Mundi \n", + "1562 2175 End of Sivan 152[.] Seleucid \n", + "1748 2460 13[..] Seleucid \n", "... ... ... ... \n", - "30589 35955 12 Muḥarram 52[.] Hijrī \n", - "31226 36738 54[.] Hijrī \n", - "32548 38077 14[...] Seleucid \n", - "34652 40226 49[.] Hijrī \n", - "34760 40335 [4]82[.] Anno Mundi \n", + "32412 38077 14[...] Seleucid \n", + "32804 38478 19 Tevet 47[..] Anno Mundi \n", + "34173 39886 4[.]4 Hijrī \n", + "34503 40226 49[.] Hijrī \n", + "34611 40335 [4]82[.] Anno Mundi \n", "\n", " doc_date_standard \n", - "1556 1244/1249 \n", - "1567 1209/1218 \n", - "1753 988/1088 \n", - "2018 1039-11-30/1138-11-24 \n", - "3044 988/1087 \n", + "1120 1440-02-05/1539-02-18 \n", + "1132 0997-07-17/1087-07-13 \n", + "1551 1244/1249 \n", + "1562 1209/1218 \n", + "1748 988/1088 \n", "... ... 
\n", - "30589 1126/1134 \n", - "31226 1145/1154 \n", - "32548 1088-09-19/1188-09-23 \n", - "34652 1096-12-19/1106-09-01 \n", - "34760 1059-09-11/1069-09-18 \n", + "32412 1088-09-19/1188-09-23 \n", + "32804 0940-01-02/1038-12-19 \n", + "34173 1023/1101 \n", + "34503 1096-12-19/1106-09-01 \n", + "34611 1059-09-11/1069-09-18 \n", "\n", - "[66 rows x 4 columns]" + "[115 rows x 4 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# how many include periods?\n", - "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]" + "docs_with_docdate[\n", + " docs_with_docdate.doc_date_original.notna()\n", + " & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")\n", + "][[\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]]" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "9fa8d2ba-6612-4de5-8741-dea177f99412", "metadata": {}, "outputs": [ @@ -809,74 +853,74 @@ " \n", " \n", " \n", - " 635\n", + " 631\n", " 1154\n", " Last decade of Kislev 5004\n", " Anno Mundi\n", " 1243-12\n", " \n", " \n", - " 1172\n", + " 1168\n", " 1750\n", " 11th Tammuz 4767\n", " Anno Mundi\n", " 1007\n", " \n", " \n", - " 1173\n", + " 1169\n", " 1751\n", " Monday, 27th Ṭevet 4797\n", " Anno Mundi\n", " 1037-01-23\n", " \n", " \n", - " 1556\n", + " 1551\n", " 2163\n", " first third of Tammuz 500[.]\n", " Anno Mundi\n", " 1244/1249\n", " \n", " \n", - " 5142\n", + " 5126\n", " 6795\n", " last decade of Tishrei 4991\n", " Anno Mundi\n", " 1230-09-29/1230-10-08\n", " \n", " \n", - " 5223\n", + " 5207\n", " 6892\n", " last decade of Iyyar 4906\n", " Anno Mundi\n", " 1146-05-04/1146-05-13\n", " \n", " \n", - " 5664\n", + " 5646\n", " 7409\n", " last third of Ḥeshvan 4965\n", " Anno Mundi\n", " 1204-10-17/1204-10-25\n", " 
\n", " \n", - " 5812\n", + " 5794\n", " 7581\n", " middle third of Adar 4876\n", " Anno Mundi\n", " 1116-05\n", " \n", " \n", - " 7024\n", + " 7003\n", " 9068\n", " Last decade of Ṭevet 4898\n", " Anno Mundi\n", " 1138-01\n", " \n", " \n", - " 8638\n", - " 11215\n", - " Middle third of Av 4889\n", + " 7049\n", + " 9120\n", + " Sunday, 5th of Kislev\n", " Anno Mundi\n", - " 1129-07-29/1129-08-07\n", + " 1140-11-17\n", " \n", " \n", "\n", @@ -884,43 +928,47 @@ ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "635 1154 Last decade of Kislev 5004 Anno Mundi \n", - "1172 1750 11th Tammuz 4767 Anno Mundi \n", - "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n", - "1556 2163 first third of Tammuz 500[.] Anno Mundi \n", - "5142 6795 last decade of Tishrei 4991 Anno Mundi \n", - "5223 6892 last decade of Iyyar 4906 Anno Mundi \n", - "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n", - "5812 7581 middle third of Adar 4876 Anno Mundi \n", - "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n", - "8638 11215 Middle third of Av 4889 Anno Mundi \n", + "631 1154 Last decade of Kislev 5004 Anno Mundi \n", + "1168 1750 11th Tammuz 4767 Anno Mundi \n", + "1169 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n", + "1551 2163 first third of Tammuz 500[.] 
Anno Mundi \n", + "5126 6795 last decade of Tishrei 4991 Anno Mundi \n", + "5207 6892 last decade of Iyyar 4906 Anno Mundi \n", + "5646 7409 last third of Ḥeshvan 4965 Anno Mundi \n", + "5794 7581 middle third of Adar 4876 Anno Mundi \n", + "7003 9068 Last decade of Ṭevet 4898 Anno Mundi \n", + "7049 9120 Sunday, 5th of Kislev Anno Mundi \n", "\n", " doc_date_standard \n", - "635 1243-12 \n", - "1172 1007 \n", - "1173 1037-01-23 \n", - "1556 1244/1249 \n", - "5142 1230-09-29/1230-10-08 \n", - "5223 1146-05-04/1146-05-13 \n", - "5664 1204-10-17/1204-10-25 \n", - "5812 1116-05 \n", - "7024 1138-01 \n", - "8638 1129-07-29/1129-08-07 " + "631 1243-12 \n", + "1168 1007 \n", + "1169 1037-01-23 \n", + "1551 1244/1249 \n", + "5126 1230-09-29/1230-10-08 \n", + "5207 1146-05-04/1146-05-13 \n", + "5646 1204-10-17/1204-10-25 \n", + "5794 1116-05 \n", + "7003 1138-01 \n", + "7049 1140-11-17 " ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# how many use ordinals instead of numerals?\n", - "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)" + "hebrew_dates[\n", + " hebrew_dates.doc_date_original.str.contains(\"st\")\n", + " | hebrew_dates.doc_date_original.str.contains(\"rd\")\n", + " | hebrew_dates.doc_date_original.str.contains(\"th\")\n", + "][[\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]].head(10)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "5b6d5811-fe81-471d-bd29-896cec4c98ff", "metadata": {}, "outputs": [ @@ -937,12 +985,14 @@ "source": [ "import re\n", "\n", + "\n", "def remove_ordinals(val):\n", - " return re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n", + " return re.sub(r\"(\\d+)(st|nd|rd|th)\", \"\\\\1\", val)\n", 
+ "\n", "\n", "# test removing ordinals without removing the numbers\n", - "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n", - " print(f\"{val}: { remove_ordinals(val)}\")" + "for val in [\"11th Tammuz 4767\", \"27th Tevet\", \"8th Kislev\"]:\n", + " print(f\"{val}: {remove_ordinals(val)}\")" ] }, { @@ -956,7 +1006,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "798da8f2-2332-48c2-aeec-214474e9d49c", "metadata": {}, "outputs": [], @@ -966,7 +1016,7 @@ "from lark.exceptions import UnexpectedEOF\n", "\n", "# set this to True to see details about parsing\n", - "VERBOSE_PARSE_OUTPUT = False \n", + "VERBOSE_PARSE_OUTPUT = False\n", "\n", "\n", "def parse_original_date(row):\n", @@ -980,27 +1030,40 @@ " # handle seleucid as hebrew with offset (adapt from pgp code)\n", " undate_calendar = \"Seleucid\"\n", "\n", - " \n", " if undate_calendar:\n", " value = row.doc_date_original\n", "\n", " # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n", " # ... the calendar parser don't support this, even though Undate does support unknown digits\n", " # in future, perhaps we can add missing digit logic with this syntax to share across appropriate parsers\n", - " if '[.' in value:\n", + " if \"[.\" in value:\n", " if VERBOSE_PARSE_OUTPUT:\n", " print(f\"ignoring missing digits for now {value}\")\n", - " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n", - " \n", + " value = (\n", + " value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\")\n", + " )\n", + "\n", " # some dates have inferred numbers, e.g. 
Friday, [25] Nisan [4810] or 8 Elul (4)811'\n", - " # for now, just strip out brackets before parsing; \n", + " # for now, just strip out brackets before parsing;\n", " # in future, could potentially infer uncertainty based on these\n", - " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n", + " value = (\n", + " value.replace(\"[\", \"\").replace(\"]\", \"\").replace(\"(\", \"\").replace(\")\", \"\")\n", + " )\n", "\n", " # for now, remove modifiers that are not supported by undate parser:\n", " # Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n", " # some dates include of, e.g. day of month\n", - " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n", + " modifiers = [\n", + " \"Late \",\n", + " \"(first|middle|last)( third|half|decade|tenth)? (of )?\",\n", + " \"(Beginning|end) of \",\n", + " \"last day\",\n", + " \"First 10 days\",\n", + " \" of\",\n", + " \"spring\",\n", + " \"decade \",\n", + " \"night, \",\n", + " ]\n", " for mod in modifiers:\n", " value = re.sub(mod, \"\", value, flags=re.I)\n", "\n", @@ -1017,12 +1080,14 @@ "\n", " # about 62 have ordinals; strip them out\n", " value = remove_ordinals(value)\n", - " \n", + "\n", " try:\n", " return Undate.parse(value, undate_calendar)\n", " except (VisitError, ValueError, UnexpectedEOF) as err:\n", " if VERBOSE_PARSE_OUTPUT:\n", - " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n", + " print(\n", + " f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\"\n", + " )\n", "\n", " # there are a handful of cases in PGP where calendars are mixed,\n", " # i.e. 
hebrew months used for hijri calendar\n", @@ -1034,13 +1099,16 @@ " if parsed:\n", " parsed = parsed.as_calendar(undate_calendar)\n", " if VERBOSE_PARSE_OUTPUT:\n", - " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n", + " print(\n", + " f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\"\n", + " )\n", " return parsed\n", " except ValueError as err:\n", " if VERBOSE_PARSE_OUTPUT:\n", " print(f\"Could not parse {value} as ISO date: {err}\")\n", "\n", - "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)" + "\n", + "docs_with_docdate[\"undate_orig\"] = docs_with_docdate.apply(parse_original_date, axis=1)" ] }, { @@ -1055,7 +1123,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86", "metadata": {}, "outputs": [ @@ -1063,21 +1131,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "original dates parsed: 3462\n", - "original dates unparsed: 173 (anno mundi, hijri, and seleucid calendars)\n", - "proportion parsed: 95.24%\n" + "original dates parsed: 4058\n", + "original dates unparsed: 198 (anno mundi, hijri, and seleucid calendars)\n", + "proportion parsed: 95.35%\n" ] } ], "source": [ "orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n", - "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n", + "orig_dates_unparsed = docs_with_docdate[\n", + " docs_with_docdate.doc_date_original.notna()\n", + " & docs_with_docdate.doc_date_calendar.isin([\"Anno Mundi\", \"Hijrī\", \"Seleucid\"])\n", + " & docs_with_docdate.undate_orig.isna()\n", + "]\n", "\n", "total_parsed = len(orig_dates_parsed)\n", "total_unparsed = 
len(orig_dates_unparsed)\n", "print(f\"\"\"original dates parsed: {total_parsed}\n", "original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n", - "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")" + "proportion parsed: {(total_parsed / (total_parsed + total_unparsed)) * 100:0.2f}%\"\"\")" ] }, { @@ -1092,7 +1164,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "42945787-6788-422d-9a04-f983ec6b31af", "metadata": {}, "outputs": [ @@ -1132,8 +1204,8 @@ " 449\n", " 1570\n", " Seleucid\n", - " 1259\n", - " 1259\n", + " 1258-08-31/1259-09-19\n", + " 1258-08-31/1259-09-19\n", " 1570\n", " year\n", " \n", @@ -1183,34 +1255,46 @@ ], "text/plain": [ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", - "5 449 1570 Seleucid 1259 \n", + "5 449 1570 Seleucid 1258-08-31/1259-09-19 \n", "16 463 19 Adar 1427 Seleucid 1116-03-05 \n", "17 464 Tammuz 1288 Seleucid 0977-06-21/0977-07-19 \n", "23 472 1337 Seleucid 1025-08-28/1026-09-14 \n", "41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n", "\n", " undate_standard undate_orig orig_date_precision \n", - "5 1259 1570 year \n", + "5 1258-08-31/1259-09-19 1570 year \n", "16 1116-03-05 1427-12-19 day \n", "17 0977-06-21/0977-07-19 1288-04 month \n", "23 1025-08-28/1026-09-14 1337 year \n", "41 1188-12-07 1500-09-15 day " ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# determine original date precision based on parsed undate\n", - "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n", - "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision']].head()" + "orig_dates_parsed[\"orig_date_precision\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: str(x.precision).lower()\n", + 
")\n", + "orig_dates_parsed[\n", + " [\n", + " \"pgpid\",\n", + " \"doc_date_original\",\n", + " \"doc_date_calendar\",\n", + " \"doc_date_standard\",\n", + " \"undate_standard\",\n", + " \"undate_orig\",\n", + " \"orig_date_precision\",\n", + " ]\n", + "].head()" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8", "metadata": {}, "outputs": [ @@ -1218,13 +1302,13 @@ "data": { "text/plain": [ "orig_date_precision\n", - "day 1599\n", - "month 1027\n", - "year 836\n", + "day 1947\n", + "month 1178\n", + "year 933\n", "Name: count, dtype: int64" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1246,7 +1330,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "5d3a55b0-ed36-47ba-b022-848bb128b449", "metadata": {}, "outputs": [ @@ -1289,7 +1373,7 @@ " Seleucid\n", " 1570\n", " year\n", - " 1259\n", + " 1258-08-31/1259-09-19\n", " 1258-09-07\n", " 1259-09-26\n", " \n", @@ -1371,27 +1455,27 @@ " 1130-11-10\n", " \n", " \n", + " 56\n", + " 517\n", + " Elul 1351\n", + " Seleucid\n", + " 1351-06\n", + " month\n", + " 1040-08-13/1040-09-10\n", + " 1040-08-19\n", + " 1040-09-16\n", + " \n", + " \n", " 73\n", " 537\n", " Ḥeshvan 1453\n", " Seleucid\n", " 1453-08\n", " month\n", - " 1141\n", + " 1141-10-04/1141-11-01\n", " 1141-10-11\n", " 1141-11-08\n", " \n", - " \n", - " 75\n", - " 544\n", - " Sunday, 21 Kislev 1355\n", - " Seleucid\n", - " 1355-09-21\n", - " day\n", - " 1043-11-26\n", - " 1043-12-02\n", - " 1043-12-02\n", - " \n", " \n", "\n", "" @@ -1406,11 +1490,11 @@ "43 502 Tevet 1548 Seleucid 1548-10 \n", "47 506 Elul 1428 Seleucid 1428-06 \n", "55 516 First decade of Ḥeshvan 1442 Seleucid 1442-08 \n", + "56 517 Elul 1351 Seleucid 1351-06 \n", "73 537 Ḥeshvan 1453 Seleucid 1453-08 \n", - "75 544 Sunday, 21 Kislev 1355 Seleucid 1355-09-21 \n", "\n", " orig_date_precision doc_date_standard undate_earliest 
undate_latest \n", - "5 year 1259 1258-09-07 1259-09-26 \n", + "5 year 1258-08-31/1259-09-19 1258-09-07 1259-09-26 \n", "16 day 1116-03-05 1116-03-12 1116-03-12 \n", "17 month 0977-06-21/0977-07-19 0977-06-26 0977-07-24 \n", "23 year 1025-08-28/1026-09-14 1025-09-03 1026-09-20 \n", @@ -1418,35 +1502,49 @@ "43 month 1236-11-30/1236-12-28 1236-12-07 1237-01-04 \n", "47 month 1117-08-01/1117-08-29 1117-08-08 1117-09-05 \n", "55 month 1130-10-06/1130-10-15 1130-10-13 1130-11-10 \n", - "73 month 1141 1141-10-11 1141-11-08 \n", - "75 day 1043-11-26 1043-12-02 1043-12-02 " + "56 month 1040-08-13/1040-09-10 1040-08-19 1040-09-16 \n", + "73 month 1141-10-04/1141-11-01 1141-10-11 1141-11-08 " ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "seleucid_dates = orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'].copy()\n", - "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date \n", - "seleucid_dates['undate_earliest'] = seleucid_dates.undate_orig.apply(lambda x: x.earliest)\n", - "seleucid_dates['undate_latest'] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n", + "seleucid_dates = orig_dates_parsed[\n", + " orig_dates_parsed.doc_date_calendar == \"Seleucid\"\n", + "].copy()\n", + "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date\n", + "seleucid_dates[\"undate_earliest\"] = seleucid_dates.undate_orig.apply(\n", + " lambda x: x.earliest\n", + ")\n", + "seleucid_dates[\"undate_latest\"] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n", "\n", - "seleucid_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'undate_orig', 'orig_date_precision', 'doc_date_standard', 'undate_earliest', 'undate_latest']].head(10)\n", - " " + "seleucid_dates[\n", + " [\n", + " \"pgpid\",\n", + " \"doc_date_original\",\n", + " \"doc_date_calendar\",\n", + " \"undate_orig\",\n", + " \"orig_date_precision\",\n", + " 
\"doc_date_standard\",\n", + " \"undate_earliest\",\n", + " \"undate_latest\",\n", + " ]\n", + "].head(10)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "a104d772-6c2c-4711-91ec-8cf1f108ae23", "metadata": {}, "outputs": [], "source": [ - "# can we sort by parsed original dates? \n", + "# can we sort by parsed original dates?\n", "# doesn't work currently because of overlapping dates / different granularity\n", - "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)" + "# orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)" ] }, { @@ -1463,7 +1561,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629", "metadata": {}, "outputs": [], @@ -1472,14 +1570,20 @@ "\n", "# NOTE: we have to cast type to something pandas/altair supports\n", "\n", - "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n", - "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n", - "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')" + "orig_dates_parsed[\"orig_date_earliest\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: x.earliest\n", + ").astype(\"datetime64[s]\")\n", + "orig_dates_parsed[\"orig_date_latest\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: x.latest\n", + ").astype(\"datetime64[s]\")\n", + "orig_dates_parsed[\"orig_date_mid\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: x.earliest + (x.latest - x.earliest) / 2\n", + ").astype(\"datetime64[s]\")" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3", "metadata": {}, "outputs": [ @@ -1577,6 +1681,14 @@ " Seleucid\n", 
" \n", " \n", + " 56\n", + " 1040-08-19\n", + " 1040-09-16\n", + " 1040-09-02\n", + " 517\n", + " Seleucid\n", + " \n", + " \n", " 61\n", " 1035-05-28\n", " 1035-05-28\n", @@ -1584,14 +1696,6 @@ " 524\n", " Anno Mundi\n", " \n", - " \n", - " 62\n", - " 1034-08-25\n", - " 1034-09-22\n", - " 1034-09-08\n", - " 525\n", - " Hijrī\n", - " \n", " \n", "\n", "" @@ -1606,22 +1710,30 @@ "43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n", "47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n", "55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n", - "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n", - "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī" + "56 1040-08-19 1040-09-16 1040-09-02 517 Seleucid\n", + "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)" + "orig_dates_parsed[\n", + " [\n", + " \"orig_date_earliest\",\n", + " \"orig_date_latest\",\n", + " \"orig_date_mid\",\n", + " \"pgpid\",\n", + " \"doc_date_calendar\",\n", + " ]\n", + "].head(10)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "144b2a4a-81cf-4a6d-a277-3a7910354a77", "metadata": {}, "outputs": [ @@ -1630,23 +1742,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 23, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1712,14 +1824,24 @@ "\n", "date_docs_cal = orig_dates_parsed[orig_dates_parsed.doc_date_standard.notna()]\n", "\n", - "dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n", - "dated_docs_cal['midpoint_year'] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n", + "dated_docs_cal = date_docs_cal.fillna({\"doc_date_calendar\": \"Unspecified\"})\n", + "dated_docs_cal[\"midpoint_year\"] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n", "\n", - "orig_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n", - " x=alt.X('midpoint_year', title=\"Year (midpoint)\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n", - " y=alt.Y('count(pgpid)', title='Documents'),\n", - " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\")\n", - ").properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n", + "orig_dates_calendars_chart = (\n", + " alt.Chart(dated_docs_cal[[\"pgpid\", \"midpoint_year\", \"doc_date_calendar\"]])\n", + " .mark_area(opacity=0.7)\n", + " .encode(\n", + " x=alt.X(\n", + " \"midpoint_year\",\n", + " title=\"Year (midpoint)\",\n", + " bin=alt.Bin(maxbins=120),\n", + " axis=alt.Axis(format=\"r\"),\n", + " ),\n", + " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n", + " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\"),\n", + " )\n", + " .properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n", + ")\n", "\n", "orig_dates_calendars_chart" ] @@ -1734,7 +1856,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "id": "4acc9a2b-d403-4f93-b2c5-6fee92ead105", "metadata": {}, "outputs": [ @@ -1743,23 +1865,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1822,29 +1944,46 @@ "source": [ "# graph documents with calendars\n", "\n", + "\n", "def undate_midpoint(value):\n", " # parsed standard date could be an undate or an interval; handle either\n", " if isinstance(value, Undate):\n", " earliest = value.earliest\n", " latest = value.latest\n", - " else: # interval\n", + " else: # interval\n", " earliest = value.earliest.earliest\n", " latest = value.latest.latest\n", - " return earliest + (latest - earliest)/2\n", - " \n", + " return earliest + (latest - earliest) / 2\n", + "\n", "\n", "dated_docs_cal = docs_with_docdate.copy()\n", - "dated_docs_cal = dated_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n", + "dated_docs_cal = dated_docs_cal.fillna({\"doc_date_calendar\": \"Unspecified\"})\n", "# get the midpoint from the parsed standard date; convert to supported type\n", - "dated_docs_cal['midpoint'] = dated_docs_cal.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n", - "dated_docs_cal['midpoint_year'] = dated_docs_cal.midpoint.apply(lambda x: x.year if pd.notna(x) else None)\n", + "dated_docs_cal[\"midpoint\"] = dated_docs_cal.undate_standard.apply(\n", + " lambda x: undate_midpoint(x) if pd.notna(x) else None\n", + ").astype(\"datetime64[s]\")\n", + "dated_docs_cal[\"midpoint_year\"] = dated_docs_cal.midpoint.apply(\n", + " lambda x: x.year if pd.notna(x) else None\n", + ")\n", "\n", "\n", - "std_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n", - " x=alt.X('midpoint_year', title=\"Year\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n", - " y=alt.Y('count(pgpid)', title='Documents'),\n", - " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(domain=['Anno Mundi', 'Hijrī', 'Seleucid', 'Kharājī', 
'Unspecified'])\n", - ").properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n", + "std_dates_calendars_chart = (\n", + " alt.Chart(dated_docs_cal[[\"pgpid\", \"midpoint_year\", \"doc_date_calendar\"]])\n", + " .mark_area(opacity=0.7)\n", + " .encode(\n", + " x=alt.X(\n", + " \"midpoint_year\",\n", + " title=\"Year\",\n", + " bin=alt.Bin(maxbins=120),\n", + " axis=alt.Axis(format=\"r\"),\n", + " ),\n", + " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n", + " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(\n", + " domain=[\"Anno Mundi\", \"Hijrī\", \"Seleucid\", \"Kharājī\", \"Unspecified\"]\n", + " ),\n", + " )\n", + " .properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n", + ")\n", "\n", "std_dates_calendars_chart" ] @@ -1859,7 +1998,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "id": "4d7c4d5f-636c-42a0-a906-21c67f5781b8", "metadata": {}, "outputs": [ @@ -1868,23 +2007,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1958,7 +2097,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7", "metadata": {}, "outputs": [ @@ -1967,23 +2106,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 26, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].copy()\n", + "graphable_data = orig_dates_parsed[\n", + " [\n", + " \"orig_date_earliest\",\n", + " \"orig_date_latest\",\n", + " \"orig_date_mid\",\n", + " \"pgpid\",\n", + " \"doc_date_calendar\",\n", + " ]\n", + "].copy()\n", "# graphable_data['midpoint'] = graphable_data.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n", - "graphable_data['midpoint_year'] = graphable_data.orig_date_mid.apply(lambda x: x.year if pd.notna(x) else None)\n", + "graphable_data[\"midpoint_year\"] = graphable_data.orig_date_mid.apply(\n", + " lambda x: x.year if pd.notna(x) else None\n", + ")\n", "\n", "\n", - "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n", - " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n", - " x2='orig_date_latest:T',\n", - " y=alt.Y('count(pgpid)', title='Count of Documents')\n", - ").properties(width=1200, height=150)\n", + "bar_chart = (\n", + " alt.Chart(graphable_data)\n", + " .mark_bar(opacity=0.5)\n", + " .encode(\n", + " x=alt.X(\n", + " \"orig_date_earliest:T\", title=\"original date (range)\"\n", + " ), # , axis=alt.Axis(format=\"r\")),\n", + " x2=\"orig_date_latest:T\",\n", + " y=alt.Y(\"count(pgpid)\", title=\"Count of Documents\"),\n", + " )\n", + " .properties(width=1200, height=150)\n", + ")\n", "\n", - "line_chart = alt.Chart(graphable_data).mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\").encode(\n", - " x=alt.X('orig_date_mid:T', title=\"Year (midpoint)\"),\n", - " y=alt.Y('count(pgpid)', title='Documents')\n", - ").properties(width=1200, height=150)\n", + "line_chart = (\n", + " 
alt.Chart(graphable_data)\n", + " .mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\")\n", + " .encode(\n", + " x=alt.X(\"orig_date_mid:T\", title=\"Year (midpoint)\"),\n", + " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n", + " )\n", + " .properties(width=1200, height=150)\n", + ")\n", "\n", "(bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\").interactive()" ] @@ -2075,7 +2236,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "id": "3122a874-bb17-429f-993f-4bf7a76c1a36", "metadata": {}, "outputs": [ @@ -2112,7 +2273,7 @@ " \n", " \n", " \n", - " 851\n", + " 847\n", " 1377\n", " Wednesday night, 28 Sivan 1581\n", " Seleucid\n", @@ -2123,7 +2284,7 @@ " Legal document\n", " \n", " \n", - " 1714\n", + " 1709\n", " 2418\n", " Monday 20 Tevet 1520\n", " Seleucid\n", @@ -2134,7 +2295,7 @@ " Legal document\n", " \n", " \n", - " 1929\n", + " 1923\n", " 2649\n", " Sunday night, 25 Kislev 1444\n", " Seleucid\n", @@ -2145,7 +2306,7 @@ " Legal document\n", " \n", " \n", - " 2013\n", + " 2007\n", " 2739\n", " Wednesday 29th Elul 1354\n", " Seleucid\n", @@ -2156,7 +2317,7 @@ " Legal document\n", " \n", " \n", - " 3257\n", + " 3248\n", " 4026\n", " Wednesday night, 29 Tishrei 1541\n", " Seleucid\n", @@ -2178,7 +2339,7 @@ " ...\n", " \n", " \n", - " 29303\n", + " 29175\n", " 34623\n", " Sunday night, 20 Ṭevet 1578\n", " Seleucid\n", @@ -2189,7 +2350,7 @@ " Legal document\n", " \n", " \n", - " 29924\n", + " 29792\n", " 35264\n", " Wednesday 13 Ṭevet 1526\n", " Seleucid\n", @@ -2200,7 +2361,7 @@ " Legal document\n", " \n", " \n", - " 34008\n", + " 33867\n", " 39564\n", " Monday 16 Tevet 1339\n", " Seleucid\n", @@ -2211,7 +2372,7 @@ " Legal document\n", " \n", " \n", - " 34466\n", + " 34322\n", " 40035\n", " Monday 1st Iyyar 1437\n", " Seleucid\n", @@ -2222,7 +2383,7 @@ " Legal document\n", " \n", " \n", - " 34467\n", + " 34323\n", " 40036\n", " Friday 15 of Adar 1443\n", " Seleucid\n", @@ -2234,59 +2395,72 @@ 
" \n", " \n", "\n", - "

104 rows × 8 columns

\n", + "

106 rows × 8 columns

\n", "" ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", - "1714 2418 Monday 20 Tevet 1520 Seleucid \n", - "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", - "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", - "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1709 2418 Monday 20 Tevet 1520 Seleucid \n", + "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2007 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", "... ... ... ... \n", - "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", - "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", - "34008 39564 Monday 16 Tevet 1339 Seleucid \n", - "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", - "34467 40036 Friday 15 of Adar 1443 Seleucid \n", + "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29792 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "33867 39564 Monday 16 Tevet 1339 Seleucid \n", + "34322 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34323 40036 Friday 15 of Adar 1443 Seleucid \n", "\n", " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", - "851 1270 1270 1581-03-28 day \n", - "1714 1208-12-29 1208-12-29 1520-10-20 day \n", - "1929 1133 1133 1444-09-25 day \n", - "2013 1043-09-07 1043-09-07 1354-06-29 day \n", - "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "847 1270 1270 1581-03-28 day \n", + "1709 1208-12-29 1208-12-29 1520-10-20 day \n", + "1923 1133 1133 1444-09-25 day \n", + "2007 1043-09-07 1043-09-07 1354-06-29 day \n", + "3248 1229-09-18 1229-09-18 1541-07-29 day \n", "... ... ... ... ... 
\n", - "29303 1266/1267 1266/1267 1578-10-20 day \n", - "29924 1214/1215 1214/1215 1526-10-13 day \n", - "34008 1027-12-18 1027-12-18 1339-10-16 day \n", - "34466 1126-04-26 1126-04-26 1437-02-01 day \n", - "34467 1132-03-04 1132-03-04 1443-12-15 day \n", + "29175 1266/1267 1266/1267 1578-10-20 day \n", + "29792 1214/1215 1214/1215 1526-10-13 day \n", + "33867 1027-12-18 1027-12-18 1339-10-16 day \n", + "34322 1126-04-26 1126-04-26 1437-02-01 day \n", + "34323 1132-03-04 1132-03-04 1443-12-15 day \n", "\n", " type \n", - "851 Legal document \n", - "1714 Legal document \n", - "1929 Legal document \n", - "2013 Legal document \n", - "3257 Legal document \n", + "847 Legal document \n", + "1709 Legal document \n", + "1923 Legal document \n", + "2007 Legal document \n", + "3248 Legal document \n", "... ... \n", - "29303 Legal document \n", - "29924 Legal document \n", - "34008 Legal document \n", - "34466 Legal document \n", - "34467 Legal document \n", + "29175 Legal document \n", + "29792 Legal document \n", + "33867 Legal document \n", + "34322 Legal document \n", + "34323 Legal document \n", "\n", - "[104 rows x 8 columns]" + "[106 rows x 8 columns]" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision', 'type']]\n", + "weekday_dates = orig_dates_parsed[\n", + " orig_dates_parsed.doc_date_original.str.contains(\"day \")\n", + "][\n", + " [\n", + " \"pgpid\",\n", + " \"doc_date_original\",\n", + " \"doc_date_calendar\",\n", + " \"doc_date_standard\",\n", + " \"undate_standard\",\n", + " \"undate_orig\",\n", + " \"orig_date_precision\",\n", + " \"type\",\n", + " ]\n", + "]\n", "weekday_dates" ] }, @@ -2302,7 +2476,7 @@ }, { "cell_type": "code", - "execution_count": 28, + 
"execution_count": 29, "id": "3e4ea50c-b11c-433b-b6f9-691098b057d3", "metadata": {}, "outputs": [ @@ -2342,7 +2516,7 @@ " \n", " \n", " \n", - " 851\n", + " 847\n", " 1377\n", " Wednesday night, 28 Sivan 1581\n", " Seleucid\n", @@ -2356,7 +2530,7 @@ " Thursday\n", " \n", " \n", - " 1714\n", + " 1709\n", " 2418\n", " Monday 20 Tevet 1520\n", " Seleucid\n", @@ -2370,7 +2544,7 @@ " Monday\n", " \n", " \n", - " 1929\n", + " 1923\n", " 2649\n", " Sunday night, 25 Kislev 1444\n", " Seleucid\n", @@ -2384,7 +2558,7 @@ " Monday\n", " \n", " \n", - " 2013\n", + " 2007\n", " 2739\n", " Wednesday 29th Elul 1354\n", " Seleucid\n", @@ -2398,7 +2572,7 @@ " Wednesday\n", " \n", " \n", - " 3257\n", + " 3248\n", " 4026\n", " Wednesday night, 29 Tishrei 1541\n", " Seleucid\n", @@ -2426,7 +2600,7 @@ " ...\n", " \n", " \n", - " 29303\n", + " 29175\n", " 34623\n", " Sunday night, 20 Ṭevet 1578\n", " Seleucid\n", @@ -2440,7 +2614,7 @@ " Monday\n", " \n", " \n", - " 29924\n", + " 29792\n", " 35264\n", " Wednesday 13 Ṭevet 1526\n", " Seleucid\n", @@ -2454,7 +2628,7 @@ " Wednesday\n", " \n", " \n", - " 34008\n", + " 33867\n", " 39564\n", " Monday 16 Tevet 1339\n", " Seleucid\n", @@ -2468,7 +2642,7 @@ " Monday\n", " \n", " \n", - " 34466\n", + " 34322\n", " 40035\n", " Monday 1st Iyyar 1437\n", " Seleucid\n", @@ -2482,7 +2656,7 @@ " Monday\n", " \n", " \n", - " 34467\n", + " 34323\n", " 40036\n", " Friday 15 of Adar 1443\n", " Seleucid\n", @@ -2497,53 +2671,53 @@ " \n", " \n", "\n", - "

104 rows × 11 columns

\n", + "

106 rows × 11 columns

\n", "" ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", - "1714 2418 Monday 20 Tevet 1520 Seleucid \n", - "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", - "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", - "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1709 2418 Monday 20 Tevet 1520 Seleucid \n", + "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "2007 2739 Wednesday 29th Elul 1354 Seleucid \n", + "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", "... ... ... ... \n", - "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", - "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", - "34008 39564 Monday 16 Tevet 1339 Seleucid \n", - "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", - "34467 40036 Friday 15 of Adar 1443 Seleucid \n", + "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "29792 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", + "33867 39564 Monday 16 Tevet 1339 Seleucid \n", + "34322 40035 Monday 1st Iyyar 1437 Seleucid \n", + "34323 40036 Friday 15 of Adar 1443 Seleucid \n", "\n", " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", - "851 1270 1270 1581-03-28 day \n", - "1714 1208-12-29 1208-12-29 1520-10-20 day \n", - "1929 1133 1133 1444-09-25 day \n", - "2013 1043-09-07 1043-09-07 1354-06-29 day \n", - "3257 1229-09-18 1229-09-18 1541-07-29 day \n", + "847 1270 1270 1581-03-28 day \n", + "1709 1208-12-29 1208-12-29 1520-10-20 day \n", + "1923 1133 1133 1444-09-25 day \n", + "2007 1043-09-07 1043-09-07 1354-06-29 day \n", + "3248 1229-09-18 1229-09-18 1541-07-29 day \n", "... ... ... ... ... 
\n", - "29303 1266/1267 1266/1267 1578-10-20 day \n", - "29924 1214/1215 1214/1215 1526-10-13 day \n", - "34008 1027-12-18 1027-12-18 1339-10-16 day \n", - "34466 1126-04-26 1126-04-26 1437-02-01 day \n", - "34467 1132-03-04 1132-03-04 1443-12-15 day \n", + "29175 1266/1267 1266/1267 1578-10-20 day \n", + "29792 1214/1215 1214/1215 1526-10-13 day \n", + "33867 1027-12-18 1027-12-18 1339-10-16 day \n", + "34322 1126-04-26 1126-04-26 1437-02-01 day \n", + "34323 1132-03-04 1132-03-04 1443-12-15 day \n", "\n", " type undate_weekday undate_weekday_name orig_weekday \n", - "851 Legal document 3 Thursday Thursday \n", - "1714 Legal document 0 Monday Monday \n", - "1929 Legal document 0 Monday Monday \n", - "2013 Legal document 2 Wednesday Wednesday \n", - "3257 Legal document 3 Thursday Thursday \n", + "847 Legal document 3 Thursday Thursday \n", + "1709 Legal document 0 Monday Monday \n", + "1923 Legal document 0 Monday Monday \n", + "2007 Legal document 2 Wednesday Wednesday \n", + "3248 Legal document 3 Thursday Thursday \n", "... ... ... ... ... 
\n", - "29303 Legal document 0 Monday Monday \n", - "29924 Legal document 2 Wednesday Wednesday \n", - "34008 Legal document 0 Monday Monday \n", - "34466 Legal document 0 Monday Monday \n", - "34467 Legal document 4 Friday Friday \n", + "29175 Legal document 0 Monday Monday \n", + "29792 Legal document 2 Wednesday Wednesday \n", + "33867 Legal document 0 Monday Monday \n", + "34322 Legal document 0 Monday Monday \n", + "34323 Legal document 4 Friday Friday \n", "\n", - "[104 rows x 11 columns]" + "[106 rows x 11 columns]" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -2552,23 +2726,38 @@ "days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n", "\n", "# get numeric weekday; since these dates are all day-precision we can just use the earliest date\n", - "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n", - "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n", + "weekday_dates[\"undate_weekday\"] = weekday_dates.undate_orig.apply(\n", + " lambda x: x.earliest.weekday\n", + ")\n", + "weekday_dates[\"undate_weekday_name\"] = weekday_dates.undate_weekday.apply(\n", + " lambda x: days[x]\n", + ")\n", "# extract weekday from date label\n", - "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n", + "weekday_dates[\"orig_weekday\"] = weekday_dates.doc_date_original.str.extract(\n", + " \"([a-zA-Z]+day)\", expand=False\n", + ").str.strip()\n", "# correct misspellings\n", "misspelled_days = {\n", " \"Wedensday\": \"Wednesday\",\n", " \"Thrusday\": \"Thursday\",\n", "}\n", - "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n", + "weekday_dates[\"orig_weekday\"] = weekday_dates.orig_weekday.apply(\n", + " lambda x: misspelled_days.get(x, x)\n", + ")\n", + 
"\n", "\n", "# shift night to next day, e.g. Wednesday night should be Thursday\n", "# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n", "def next_day(weekday):\n", - " return days[(days.index(weekday) +1) % 7]\n", + " return days[(days.index(weekday) + 1) % 7]\n", "\n", - "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n", + "\n", + "weekday_dates[\"orig_weekday\"] = weekday_dates.apply(\n", + " lambda row: next_day(row.orig_weekday)\n", + " if \" night\" in row.doc_date_original\n", + " else row.orig_weekday,\n", + " axis=1,\n", + ")\n", "\n", "weekday_dates" ] @@ -2583,7 +2772,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad", "metadata": {}, "outputs": [ @@ -2623,7 +2812,7 @@ " \n", " \n", " \n", - " 851\n", + " 847\n", " 1377\n", " Wednesday night, 28 Sivan 1581\n", " Seleucid\n", @@ -2637,7 +2826,7 @@ " Thursday\n", " \n", " \n", - " 1929\n", + " 1923\n", " 2649\n", " Sunday night, 25 Kislev 1444\n", " Seleucid\n", @@ -2651,7 +2840,7 @@ " Monday\n", " \n", " \n", - " 3257\n", + " 3248\n", " 4026\n", " Wednesday night, 29 Tishrei 1541\n", " Seleucid\n", @@ -2665,7 +2854,7 @@ " Thursday\n", " \n", " \n", - " 5511\n", + " 5493\n", " 7237\n", " Tuesday night, 22 Kislev 1435\n", " Seleucid\n", @@ -2679,21 +2868,7 @@ " Wednesday\n", " \n", " \n", - " 5854\n", - " 7637\n", - " Monday night, 29 Ṭevet 1438\n", - " Seleucid\n", - " 1127\n", - " 1127\n", - " 1438-10-29\n", - " day\n", - " Legal document\n", - " 4\n", - " Friday\n", - " Tuesday\n", - " \n", - " \n", - " 5857\n", + " 5839\n", " 7642\n", " Thursday night, 23 Tammuz 1538\n", " Seleucid\n", @@ -2707,7 +2882,7 @@ " Friday\n", " \n", " \n", - " 6419\n", + " 6400\n", " 8332\n", " Friday night, 20 Iyar 4957\n", " Anno Mundi\n", @@ -2721,7 +2896,7 @@ " 
Saturday\n", " \n", " \n", - " 29303\n", + " 29175\n", " 34623\n", " Sunday night, 20 Ṭevet 1578\n", " Seleucid\n", @@ -2740,37 +2915,34 @@ ], "text/plain": [ " pgpid doc_date_original doc_date_calendar \\\n", - "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", - "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", - "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", - "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n", - "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n", - "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n", - "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n", - "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", + "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", + "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n", + "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", + "5493 7237 Tuesday night, 22 Kislev 1435 Seleucid \n", + "5839 7642 Thursday night, 23 Tammuz 1538 Seleucid \n", + "6400 8332 Friday night, 20 Iyar 4957 Anno Mundi \n", + "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", "\n", " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", - "851 1270 1270 1581-03-28 day \n", - "1929 1133 1133 1444-09-25 day \n", - "3257 1229-09-18 1229-09-18 1541-07-29 day \n", - "5511 1123-12-12 1123-12-12 1435-09-22 day \n", - "5854 1127 1127 1438-10-29 day \n", - "5857 1227-07-09 1227-07-09 1538-04-23 day \n", - "6419 1197-05 1197-05 4957-02-20 day \n", - "29303 1266/1267 1266/1267 1578-10-20 day \n", + "847 1270 1270 1581-03-28 day \n", + "1923 1133 1133 1444-09-25 day \n", + "3248 1229-09-18 1229-09-18 1541-07-29 day \n", + "5493 1123-12-12 1123-12-12 1435-09-22 day \n", + "5839 1227-07-09 1227-07-09 1538-04-23 day \n", + "6400 1197-05 1197-05 4957-02-20 day \n", + "29175 1266/1267 1266/1267 1578-10-20 day \n", "\n", " type undate_weekday undate_weekday_name orig_weekday \n", - "851 Legal document 3 Thursday Thursday \n", - "1929 Legal document 0 Monday Monday \n", - "3257 Legal 
document 3 Thursday Thursday \n", - "5511 Legal document 2 Wednesday Wednesday \n", - "5854 Legal document 4 Friday Tuesday \n", - "5857 Legal document 4 Friday Friday \n", - "6419 Legal document 5 Saturday Saturday \n", - "29303 Legal document 0 Monday Monday " + "847 Legal document 3 Thursday Thursday \n", + "1923 Legal document 0 Monday Monday \n", + "3248 Legal document 3 Thursday Thursday \n", + "5493 Legal document 2 Wednesday Wednesday \n", + "5839 Legal document 4 Friday Friday \n", + "6400 Legal document 5 Saturday Saturday \n", + "29175 Legal document 0 Monday Monday " ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -2789,7 +2961,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d", "metadata": {}, "outputs": [ @@ -2797,7 +2969,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "44 matches, 60 mismatches (42.31%)\n" + "46 matches, 60 mismatches (43.40%)\n" ] }, { @@ -2836,7 +3008,7 @@ " \n", " \n", " \n", - " 5271\n", + " 5255\n", " 6947\n", " Monday 3 Iyyar 1740\n", " Seleucid\n", @@ -2850,21 +3022,7 @@ " Monday\n", " \n", " \n", - " 5854\n", - " 7637\n", - " Monday night, 29 Ṭevet 1438\n", - " Seleucid\n", - " 1127\n", - " 1127\n", - " 1438-10-29\n", - " day\n", - " Legal document\n", - " 4\n", - " Friday\n", - " Tuesday\n", - " \n", - " \n", - " 8648\n", + " 8624\n", " 11227\n", " Monday 24 Jumādā I 517\n", " Hijrī\n", @@ -2878,7 +3036,7 @@ " Monday\n", " \n", " \n", - " 16397\n", + " 16299\n", " 19649\n", " Thursday 26 Iyyar 5306\n", " Anno Mundi\n", @@ -2892,7 +3050,7 @@ " Thursday\n", " \n", " \n", - " 17723\n", + " 17622\n", " 21094\n", " Saturday 20 Rajab 550\n", " Hijrī\n", @@ -2906,7 +3064,7 @@ " Saturday\n", " \n", " \n", - " 23099\n", + " 22986\n", " 27479\n", " Tuesday 11 Tammuz 5525\n", " Anno Mundi\n", @@ -2920,12 +3078,12 @@ " Tuesday\n", " \n", " \n", - " 23104\n", + " 22991\n", " 27484\n", - " 
Friday 20th Shevat 5405\n", + " Friday 20 Shevat 5405\n", " Anno Mundi\n", - " 1645\n", - " 1645\n", + " 1645-02-16\n", + " 1645-02-16\n", " 5405-11-20\n", " day\n", " Legal document\n", @@ -2934,7 +3092,7 @@ " Friday\n", " \n", " \n", - " 23105\n", + " 22992\n", " 27485\n", " Sunday 22 Adar 5590\n", " Anno Mundi\n", @@ -2948,7 +3106,7 @@ " Sunday\n", " \n", " \n", - " 23107\n", + " 22994\n", " 27487\n", " Thursday 15 Shevat 5450\n", " Anno Mundi\n", @@ -2962,7 +3120,7 @@ " Thursday\n", " \n", " \n", - " 23109\n", + " 22996\n", " 27489\n", " Sunday 6 Nisan 5528\n", " Anno Mundi\n", @@ -2976,12 +3134,12 @@ " Sunday\n", " \n", " \n", - " 23110\n", + " 22997\n", " 27490\n", - " Thursday 19th Elul 5428\n", + " Thursday 19 Elul 5428\n", " Anno Mundi\n", - " 1668\n", - " 1668\n", + " 1668-08-26\n", + " 1668-08-26\n", " 5428-06-19\n", " day\n", " Legal document\n", @@ -2990,7 +3148,7 @@ " Thursday\n", " \n", " \n", - " 23111\n", + " 22998\n", " 27491\n", " Tuesday 1 Kislev 5507\n", " Anno Mundi\n", @@ -3004,7 +3162,7 @@ " Tuesday\n", " \n", " \n", - " 23116\n", + " 23003\n", " 27496\n", " Sunday 28 Elul 5511\n", " Anno Mundi\n", @@ -3018,12 +3176,12 @@ " Sunday\n", " \n", " \n", - " 23117\n", + " 23004\n", " 27497\n", - " Sunday 17th Sivan 5423\n", + " Sunday 17 Sivan 5423\n", " Anno Mundi\n", - " 1663\n", - " 1663\n", + " 1663-06-22\n", + " 1663-06-22\n", " 5423-03-17\n", " day\n", " Legal document\n", @@ -3032,12 +3190,12 @@ " Sunday\n", " \n", " \n", - " 23118\n", + " 23005\n", " 27498\n", - " Sunday 25th Tevet 5409\n", + " Sunday 25 Tevet 5409\n", " Anno Mundi\n", - " 1648\n", - " 1648\n", + " 1649-01-09\n", + " 1649-01-09\n", " 5409-10-25\n", " day\n", " Legal document\n", @@ -3046,7 +3204,7 @@ " Sunday\n", " \n", " \n", - " 23120\n", + " 23007\n", " 27500\n", " Thursday 4 Sivan 5516\n", " Anno Mundi\n", @@ -3060,7 +3218,7 @@ " Thursday\n", " \n", " \n", - " 23127\n", + " 23014\n", " 27507\n", " Sunday 25 Sivan 5556\n", " Anno Mundi\n", @@ -3074,12 +3232,12 @@ " 
Sunday\n", " \n", " \n", - " 23131\n", + " 23018\n", " 27511\n", - " Wednesday 28th Tevet 5399\n", + " Wednesday 28 Tevet 5399\n", " Anno Mundi\n", - " 1640\n", - " 1640\n", + " 1639-01-04\n", + " 1639-01-04\n", " 5399-10-28\n", " day\n", " Legal document\n", @@ -3088,12 +3246,12 @@ " Wednesday\n", " \n", " \n", - " 23135\n", + " 23022\n", " 27515\n", - " Monday 15th Iyyar 5414\n", + " Monday 15 Iyyar 5414\n", " Anno Mundi\n", - " 1654\n", - " 1654\n", + " 1654-05-02\n", + " 1654-05-02\n", " 5414-02-15\n", " day\n", " Legal document\n", @@ -3102,7 +3260,7 @@ " Monday\n", " \n", " \n", - " 23136\n", + " 23023\n", " 27516\n", " Thursday 24 Nisan 5481\n", " Anno Mundi\n", @@ -3115,79 +3273,93 @@ " Monday\n", " Thursday\n", " \n", + " \n", + " 23053\n", + " 27546\n", + " Thursday 13th Nisan 5544\n", + " Anno Mundi\n", + " 1784\n", + " 1784\n", + " 5544-01-13\n", + " day\n", + " List or table\n", + " 6\n", + " Sunday\n", + " Thursday\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", - "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n", - "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", - "8648 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n", - "16397 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n", - "17723 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n", - "23099 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n", - "23104 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n", - "23105 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n", - "23107 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n", - "23109 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n", - "23110 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n", - "23111 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n", - "23116 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n", - "23117 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n", - "23118 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n", - "23120 27500 
Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n", - "23127 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n", - "23131 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n", - "23135 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n", - "23136 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n", + " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", + "5255 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n", + "8624 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n", + "16299 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n", + "17622 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n", + "22986 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n", + "22991 27484 Friday 20 Shevat 5405 Anno Mundi 1645-02-16 \n", + "22992 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n", + "22994 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n", + "22996 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n", + "22997 27490 Thursday 19 Elul 5428 Anno Mundi 1668-08-26 \n", + "22998 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n", + "23003 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n", + "23004 27497 Sunday 17 Sivan 5423 Anno Mundi 1663-06-22 \n", + "23005 27498 Sunday 25 Tevet 5409 Anno Mundi 1649-01-09 \n", + "23007 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n", + "23014 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n", + "23018 27511 Wednesday 28 Tevet 5399 Anno Mundi 1639-01-04 \n", + "23022 27515 Monday 15 Iyyar 5414 Anno Mundi 1654-05-02 \n", + "23023 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n", + "23053 27546 Thursday 13th Nisan 5544 Anno Mundi 1784 \n", "\n", " undate_standard undate_orig orig_date_precision type \\\n", - "5271 1429-04-07 1740-02-03 day Legal document \n", - "5854 1127 1438-10-29 day Legal document \n", - "8648 1123-07-20 0517-05-24 day Paraliterary text \n", - "16397 1546-04-28 5306-02-26 day Legal document \n", - "17723 1155-09-19 0550-07-20 day Legal document \n", - "23099 1765-06-30 5525-04-11 day 
Legal document \n", - "23104 1645 5405-11-20 day Legal document \n", - "23105 1830-03-17 5590-12-22 day Legal document \n", - "23107 1690-01-25 5450-11-15 day Legal document \n", - "23109 1768-03-24 5528-01-06 day Legal document \n", - "23110 1668 5428-06-19 day Legal document \n", - "23111 1746-11-14 5507-09-01 day Legal document \n", - "23116 1751-09-18 5511-06-28 day Legal document \n", - "23117 1663 5423-03-17 day Legal document \n", - "23118 1648 5409-10-25 day Legal document \n", - "23120 1756-06-02 5516-03-04 day Legal document \n", - "23127 1796-07-01 5556-03-25 day Legal document \n", - "23131 1640 5399-10-28 day Legal document \n", - "23135 1654 5414-02-15 day Legal document \n", - "23136 1721-04-21 5481-01-24 day Legal document \n", + "5255 1429-04-07 1740-02-03 day Legal document \n", + "8624 1123-07-20 0517-05-24 day Paraliterary text \n", + "16299 1546-04-28 5306-02-26 day Legal document \n", + "17622 1155-09-19 0550-07-20 day Legal document \n", + "22986 1765-06-30 5525-04-11 day Legal document \n", + "22991 1645-02-16 5405-11-20 day Legal document \n", + "22992 1830-03-17 5590-12-22 day Legal document \n", + "22994 1690-01-25 5450-11-15 day Legal document \n", + "22996 1768-03-24 5528-01-06 day Legal document \n", + "22997 1668-08-26 5428-06-19 day Legal document \n", + "22998 1746-11-14 5507-09-01 day Legal document \n", + "23003 1751-09-18 5511-06-28 day Legal document \n", + "23004 1663-06-22 5423-03-17 day Legal document \n", + "23005 1649-01-09 5409-10-25 day Legal document \n", + "23007 1756-06-02 5516-03-04 day Legal document \n", + "23014 1796-07-01 5556-03-25 day Legal document \n", + "23018 1639-01-04 5399-10-28 day Legal document \n", + "23022 1654-05-02 5414-02-15 day Legal document \n", + "23023 1721-04-21 5481-01-24 day Legal document \n", + "23053 1784 5544-01-13 day List or table \n", "\n", " undate_weekday undate_weekday_name orig_weekday \n", - "5271 3 Thursday Monday \n", - "5854 4 Friday Tuesday \n", - "8648 4 Friday Monday \n", 
- "16397 2 Wednesday Thursday \n", - "17723 0 Monday Saturday \n", - "23099 6 Sunday Tuesday \n", - "23104 3 Thursday Friday \n", - "23105 2 Wednesday Sunday \n", - "23107 2 Wednesday Thursday \n", - "23109 3 Thursday Sunday \n", - "23110 6 Sunday Thursday \n", - "23111 0 Monday Tuesday \n", - "23116 5 Saturday Sunday \n", - "23117 4 Friday Sunday \n", - "23118 5 Saturday Sunday \n", - "23120 2 Wednesday Thursday \n", - "23127 4 Friday Sunday \n", - "23131 1 Tuesday Wednesday \n", - "23135 5 Saturday Monday \n", - "23136 0 Monday Thursday " + "5255 3 Thursday Monday \n", + "8624 4 Friday Monday \n", + "16299 2 Wednesday Thursday \n", + "17622 0 Monday Saturday \n", + "22986 6 Sunday Tuesday \n", + "22991 3 Thursday Friday \n", + "22992 2 Wednesday Sunday \n", + "22994 2 Wednesday Thursday \n", + "22996 3 Thursday Sunday \n", + "22997 6 Sunday Thursday \n", + "22998 0 Monday Tuesday \n", + "23003 5 Saturday Sunday \n", + "23004 4 Friday Sunday \n", + "23005 5 Saturday Sunday \n", + "23007 2 Wednesday Thursday \n", + "23014 4 Friday Sunday \n", + "23018 1 Tuesday Wednesday \n", + "23022 5 Saturday Monday \n", + "23023 0 Monday Thursday \n", + "23053 6 Sunday Thursday " ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -3195,9 +3367,13 @@ "source": [ "matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n", "\n", - "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n", + "mismatches = weekday_dates[\n", + " weekday_dates.undate_weekday_name != weekday_dates.orig_weekday\n", + "]\n", "\n", - "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n", + "print(\n", + " f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches) / (len(matches) + len(mismatches))) * 100:0.2f}%)\"\n", + ")\n", "mismatches.head(20)" ] }, @@ -3211,7 +3387,7 @@ }, { 
"cell_type": "code", - "execution_count": 31, + "execution_count": 32, "id": "d6476907-1628-4d68-ab1f-43c95e123707", "metadata": {}, "outputs": [ @@ -3219,13 +3395,13 @@ "data": { "text/plain": [ "doc_date_calendar\n", - "Anno Mundi 55\n", - "Seleucid 3\n", + "Anno Mundi 56\n", + "Seleucid 2\n", "Hijrī 2\n", "Name: count, dtype: int64" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -3236,7 +3412,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "id": "18b71d18-5d5b-4f92-8801-499bcf412efe", "metadata": {}, "outputs": [ @@ -3245,16 +3421,16 @@ "text/plain": [ "orig_weekday\n", "Wednesday 17\n", - "Sunday 12\n", + "Sunday 13\n", "Monday 10\n", "Thursday 9\n", - "Tuesday 7\n", + "Tuesday 6\n", "Friday 4\n", "Saturday 1\n", "Name: count, dtype: int64" ] }, - "execution_count": 32, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -3265,7 +3441,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 34, "id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572", "metadata": {}, "outputs": [ @@ -3273,7 +3449,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1 mismatches that include text 'night'\n" + "0 mismatches that include text 'night'\n" ] }, { @@ -3311,36 +3487,17 @@ " \n", " \n", " \n", - " \n", - " 5854\n", - " 7637\n", - " Monday night, 29 Ṭevet 1438\n", - " Seleucid\n", - " 1127\n", - " 1127\n", - " 1438-10-29\n", - " day\n", - " Legal document\n", - " 4\n", - " Friday\n", - " Tuesday\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n", - "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n", - "\n", - " undate_standard undate_orig orig_date_precision type \\\n", - "5854 1127 1438-10-29 day Legal document \n", - "\n", - " undate_weekday undate_weekday_name orig_weekday \n", - "5854 4 Friday Tuesday " + "Empty DataFrame\n", + "Columns: [pgpid, 
doc_date_original, doc_date_calendar, doc_date_standard, undate_standard, undate_orig, orig_date_precision, type, undate_weekday, undate_weekday_name, orig_weekday]\n", + "Index: []" ] }, - "execution_count": 33, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -3366,7 +3523,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 35, "id": "ece780b8-2eb2-4cbc-9195-27def665f7fa", "metadata": {}, "outputs": [ @@ -3375,23 +3532,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 34, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get numeric weekday\n", - "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n", - "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", + "orig_dates_parsed[\"undate_weekday\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: x.earliest.weekday\n", + ")\n", + "orig_dates_parsed[\"undate_weekday_name\"] = orig_dates_parsed.undate_weekday.apply(\n", + " lambda x: days[x]\n", + ")\n", "\n", "# restrict to dates with day precision; the rest are just using earliest day\n", - "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n", + "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == \"day\"]\n", "\n", - "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n", - " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", - " alt.Color('count(pgpid)', title='# of documents')\n", - ").properties(title='document frequency by weekday')\n" + "alt.Chart(\n", + " orig_dates_days[[\"undate_weekday\", \"undate_weekday_name\", \"pgpid\"]]\n", + ").mark_rect().encode(\n", + " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n", + " alt.Color(\"count(pgpid)\", title=\"# of documents\"),\n", + ").properties(title=\"document frequency by weekday\")" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 36, "id": "6b2f24de-18ce-4f40-b300-e8cc334a338c", "metadata": {}, "outputs": [ @@ -3475,17 +3638,17 @@ "data": { "text/plain": [ "undate_weekday_name\n", - "Monday 305\n", - "Thursday 282\n", - "Tuesday 241\n", - "Sunday 229\n", - "Wednesday 229\n", - "Friday 215\n", - "Saturday 98\n", + "Monday 362\n", + "Thursday 337\n", + "Tuesday 303\n", + "Sunday 284\n", + 
"Wednesday 267\n", + "Friday 265\n", + "Saturday 129\n", "Name: count, dtype: int64" ] }, - "execution_count": 35, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -3496,7 +3659,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 37, "id": "dea83b43-b379-4807-8a33-8e26d7f4f8e7", "metadata": {}, "outputs": [ @@ -3505,23 +3668,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.FacetChart(...)" ] }, - "execution_count": 36, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "weekday_calendar_chart = alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", - " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", - " # alt.Y('doc_date_calendar'),\n", - " alt.Color('count(pgpid)')\n", - ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")).properties(title='document frequency by weekday and calendar')\n", + "weekday_calendar_chart = (\n", + " alt.Chart(\n", + " weekday_dates[\n", + " [\"undate_weekday\", \"undate_weekday_name\", \"pgpid\", \"doc_date_calendar\"]\n", + " ]\n", + " )\n", + " .mark_rect()\n", + " .encode(\n", + " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n", + " # alt.Y('doc_date_calendar'),\n", + " alt.Color(\"count(pgpid)\"),\n", + " )\n", + " .facet(row=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\"))\n", + " .properties(title=\"document frequency by weekday and calendar\")\n", + ")\n", "weekday_calendar_chart" ] }, @@ -3600,7 +3773,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 38, "id": "cfecdb64-03b4-405b-b1f3-85e876f55680", "metadata": {}, "outputs": [ @@ -3608,13 +3781,13 @@ "data": { "text/plain": [ "doc_date_calendar\n", - "Anno Mundi 82\n", + "Anno Mundi 84\n", "Seleucid 20\n", "Hijrī 2\n", "Name: count, dtype: int64" ] }, - "execution_count": 37, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -3633,7 +3806,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "id": "e66917b0-2221-42dd-a99b-df847b8e815b", "metadata": {}, "outputs": [ @@ -3642,23 +3815,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.FacetChart(...)" ] }, - "execution_count": 38, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "weekday_calendar_chart.resolve_scale(color='independent')" + "weekday_calendar_chart.resolve_scale(color=\"independent\")" ] }, { @@ -3732,269 +3905,39 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 4, "id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pgpiddoc_date_originaldoc_date_calendardoc_date_standardundate_standardundate_origorig_date_precisiontypeundate_weekdayundate_weekday_nameorig_weekdaycentury
8511377Wednesday night, 28 Sivan 1581Seleucid127012701581-03-28dayLegal document3ThursdayThursday1200s
17142418Monday 20 Tevet 1520Seleucid1208-12-291208-12-291520-10-20dayLegal document0MondayMonday1200s
19292649Sunday night, 25 Kislev 1444Seleucid113311331444-09-25dayLegal document0MondayMonday1100s
20132739Wednesday 29th Elul 1354Seleucid1043-09-071043-09-071354-06-29dayLegal document2WednesdayWednesday1000s
32574026Wednesday night, 29 Tishrei 1541Seleucid1229-09-181229-09-181541-07-29dayLegal document3ThursdayThursday1200s
.......................................
2930334623Sunday night, 20 Ṭevet 1578Seleucid1266/12671266/12671578-10-20dayLegal document0MondayMonday1200s
2992435264Wednesday 13 Ṭevet 1526Seleucid1214/12151214/12151526-10-13dayLegal document2WednesdayWednesday1200s
3400839564Monday 16 Tevet 1339Seleucid1027-12-181027-12-181339-10-16dayLegal document0MondayMonday1000s
3446640035Monday 1st Iyyar 1437Seleucid1126-04-261126-04-261437-02-01dayLegal document0MondayMonday1100s
3446740036Friday 15 of Adar 1443Seleucid1132-03-041132-03-041443-12-15dayLegal document4FridayFriday1100s
\n", - "

104 rows × 12 columns

\n", - "
" - ], - "text/plain": [ - " pgpid doc_date_original doc_date_calendar \\\n", - "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n", - "1714 2418 Monday 20 Tevet 1520 Seleucid \n", - "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n", - "2013 2739 Wednesday 29th Elul 1354 Seleucid \n", - "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n", - "... ... ... ... \n", - "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n", - "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n", - "34008 39564 Monday 16 Tevet 1339 Seleucid \n", - "34466 40035 Monday 1st Iyyar 1437 Seleucid \n", - "34467 40036 Friday 15 of Adar 1443 Seleucid \n", - "\n", - " doc_date_standard undate_standard undate_orig orig_date_precision \\\n", - "851 1270 1270 1581-03-28 day \n", - "1714 1208-12-29 1208-12-29 1520-10-20 day \n", - "1929 1133 1133 1444-09-25 day \n", - "2013 1043-09-07 1043-09-07 1354-06-29 day \n", - "3257 1229-09-18 1229-09-18 1541-07-29 day \n", - "... ... ... ... ... \n", - "29303 1266/1267 1266/1267 1578-10-20 day \n", - "29924 1214/1215 1214/1215 1526-10-13 day \n", - "34008 1027-12-18 1027-12-18 1339-10-16 day \n", - "34466 1126-04-26 1126-04-26 1437-02-01 day \n", - "34467 1132-03-04 1132-03-04 1443-12-15 day \n", - "\n", - " type undate_weekday undate_weekday_name orig_weekday century \n", - "851 Legal document 3 Thursday Thursday 1200s \n", - "1714 Legal document 0 Monday Monday 1200s \n", - "1929 Legal document 0 Monday Monday 1100s \n", - "2013 Legal document 2 Wednesday Wednesday 1000s \n", - "3257 Legal document 3 Thursday Thursday 1200s \n", - "... ... ... ... ... ... 
\n", - "29303 Legal document 0 Monday Monday 1200s \n", - "29924 Legal document 2 Wednesday Wednesday 1200s \n", - "34008 Legal document 0 Monday Monday 1000s \n", - "34466 Legal document 0 Monday Monday 1100s \n", - "34467 Legal document 4 Friday Friday 1100s \n", - "\n", - "[104 rows x 12 columns]" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" + "ename": "NameError", + "evalue": "name 'orig_dates_days' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# get rough century (gregorian calendar)\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m weekday_dates[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcentury\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43morig_dates_days\u001b[49m\u001b[38;5;241m.\u001b[39mundate_orig\u001b[38;5;241m.\u001b[39mapply(\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mlambda\u001b[39;00m x: (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mx\u001b[38;5;241m.\u001b[39mearliest\u001b[38;5;241m.\u001b[39myear\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m04\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m00s\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m weekday_dates[\n\u001b[1;32m 7\u001b[0m [\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpgpid\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m ]\n\u001b[1;32m 16\u001b[0m ]\u001b[38;5;241m.\u001b[39mhead()\n\u001b[1;32m 17\u001b[0m weekday_dates\n", + "\u001b[0;31mNameError\u001b[0m: name 
'orig_dates_days' is not defined" + ] } ], "source": [ "# get rough century (gregorian calendar)\n", - "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: (\"%04d\" % x.earliest.year)[:2] + \"00s\")\n", + "weekday_dates[\"century\"] = orig_dates_days.undate_orig.apply(\n", + " lambda x: (f\"{x.earliest.year:04}\")[:2] + \"00s\"\n", + ")\n", "\n", - "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'century']].head()\n", + "weekday_dates[\n", + " [\n", + " \"pgpid\",\n", + " \"doc_date_original\",\n", + " \"doc_date_calendar\",\n", + " \"doc_date_standard\",\n", + " \"undate_standard\",\n", + " \"undate_orig\",\n", + " \"century\",\n", + " ]\n", + "].head()\n", "weekday_dates" ] }, @@ -4086,12 +4029,13 @@ } ], "source": [ - "\n", - "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n", - " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", - " alt.Y('century'),\n", - " alt.Color('count(pgpid)')\n", - ").properties(title='document frequency by weekday and century')\n" + "alt.Chart(\n", + " weekday_dates[[\"undate_weekday\", \"undate_weekday_name\", \"pgpid\", \"century\"]]\n", + ").mark_rect().encode(\n", + " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n", + " alt.Y(\"century\"),\n", + " alt.Color(\"count(pgpid)\"),\n", + ").properties(title=\"document frequency by weekday and century\")" ] }, { @@ -4201,17 +4145,19 @@ "# what about heat map by month?\n", "\n", "# get numeric month\n", - "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n", + "orig_dates_parsed[\"undate_month\"] = orig_dates_parsed.undate_orig.apply(\n", + " lambda x: x.month\n", + ")\n", "# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n", "\n", "has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n", "\n", - 
"alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n", - " alt.X('undate_month', title='month'),\n", - " alt.Color('count(pgpid)', title='# of documents')\n", - ").facet(\n", - " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", - ").properties(title='Document frequency by month and calendar')" + "alt.Chart(has_month[[\"undate_month\", \"pgpid\", \"doc_date_calendar\"]]).mark_rect().encode(\n", + " alt.X(\"undate_month\", title=\"month\"),\n", + " alt.Color(\"count(pgpid)\", title=\"# of documents\"),\n", + ").facet(row=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\")).properties(\n", + " title=\"Document frequency by month and calendar\"\n", + ")" ] }, { @@ -4370,15 +4316,25 @@ "source": [ "# weekday frequency by month?\n", "\n", - "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n", + "orig_dates_days[\"undate_month\"] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n", "\n", - "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n", - " alt.X('undate_weekday_name', sort=days, title='weekday'),\n", - " alt.Y('undate_month', title=\"month\"),\n", - " alt.Color('count(pgpid)')\n", - ").facet(\n", - " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n", - ").properties(title='Document frequency by weekday and month (1,557 documents)')\n" + "alt.Chart(\n", + " orig_dates_days[\n", + " [\n", + " \"undate_weekday\",\n", + " \"undate_weekday_name\",\n", + " \"pgpid\",\n", + " \"undate_month\",\n", + " \"doc_date_calendar\",\n", + " ]\n", + " ]\n", + ").mark_rect().encode(\n", + " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n", + " alt.Y(\"undate_month\", title=\"month\"),\n", + " alt.Color(\"count(pgpid)\"),\n", + ").facet(column=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\")).properties(\n", + " title=\"Document frequency 
by weekday and month (1,557 documents)\"\n", + ")" ] } ], diff --git a/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb b/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb index 38efa6c..f7fb16a 100644 --- a/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb +++ b/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb @@ -323,19 +323,20 @@ "outputs": [], "source": [ "from undate import UndateInterval\n", - "from undate.date import ONE_DAY\n", "from undate.converters.iso8601 import ISO8601DateFormat\n", + "from undate.date import ONE_DAY\n", + "\n", "\n", "def undate_duration(start_date, end_date):\n", - " isoformat = ISO8601DateFormat()\n", + " isoformat = ISO8601DateFormat()\n", "\n", - " unstart = isoformat.parse(start_date)\n", - " unend = isoformat.parse(end_date)\n", - " interval = UndateInterval(earliest=unstart, latest=unend)\n", + " unstart = isoformat.parse(start_date)\n", + " unend = isoformat.parse(end_date)\n", + " interval = UndateInterval(earliest=unstart, latest=unend)\n", "\n", - " # subtract one here for simplicity of comparison,\n", - " # to reconcile differences between duration logic\n", - " return interval.duration() - ONE_DAY" + " # subtract one here for simplicity of comparison,\n", + " # to reconcile differences between duration logic\n", + " return interval.duration() - ONE_DAY" ] }, { @@ -461,7 +462,15 @@ "# identify subscription events with duration information\n", "subs_duration = events_df[events_df.subscription_duration_days.notna()]\n", "# limit to fields that are relevant for this exploration\n", - "subs_duration = subs_duration[['member_names', 'start_date', 'end_date', 'subscription_duration', 'subscription_duration_days']]\n", + "subs_duration = subs_duration[\n", + " [\n", + " \"member_names\",\n", + " \"start_date\",\n", + " \"end_date\",\n", + " \"subscription_duration\",\n", + " \"subscription_duration_days\",\n", + " ]\n", + 
"]\n", "subs_duration.head()" ] }, @@ -839,7 +848,9 @@ ], "source": [ "# add a new field for duration as calculated by Undate using the method defined previously\n", - "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "subs_duration[\"undate_duration\"] = subs_duration.apply(\n", + " lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1\n", + ")\n", "subs_duration.head()" ] }, @@ -1168,7 +1179,10 @@ ], "source": [ "# what's the difference between the two?\n", - "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days, axis=1)\n", + "subs_duration[\"duration_diff\"] = subs_duration.apply(\n", + " lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days,\n", + " axis=1,\n", + ")\n", "subs_duration" ] }, @@ -1206,7 +1220,7 @@ } ], "source": [ - "subs_duration['duration_diff'].value_counts()" + "subs_duration[\"duration_diff\"].value_counts()" ] }, { @@ -1693,7 +1707,7 @@ ], "source": [ "# lots of one-month subscriptions, what do the discrepancies look like?\n", - "subset_subdurations[subset_subdurations.subscription_duration == '1 month'].head(15)" + "subset_subdurations[subset_subdurations.subscription_duration == \"1 month\"].head(15)" ] }, { @@ -1964,7 +1978,7 @@ ], "source": [ "# durations other than one month\n", - "subset_subdurations[subset_subdurations.subscription_duration != '1 month'].head(15)" + "subset_subdurations[subset_subdurations.subscription_duration != \"1 month\"].head(15)" ] }, { @@ -2076,7 +2090,9 @@ "source": [ "borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n", "# limit to fields we care about for this check\n", - "borrow_duration = borrow_duration[['member_names', 'start_date', 'end_date', 'borrow_duration_days']]\n", + "borrow_duration = borrow_duration[\n", + " [\"member_names\", \"start_date\", \"end_date\", 
\"borrow_duration_days\"]\n", + "]\n", "borrow_duration.head()" ] }, @@ -2323,7 +2339,9 @@ ], "source": [ "# add a new field for duration as calculated by undate\n", - "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n", + "borrow_duration[\"undate_duration\"] = borrow_duration.apply(\n", + " lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1\n", + ")\n", "borrow_duration.head(10)" ] }, @@ -2496,7 +2514,9 @@ ], "source": [ "# what's the difference between the two?\n", - "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1)\n", + "borrow_duration[\"duration_diff\"] = borrow_duration.apply(\n", + " lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1\n", + ")\n", "borrow_duration.head(10)" ] }, diff --git a/pyproject.toml b/pyproject.toml index fcebbbd..09c8ca5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,13 +6,13 @@ build-backend = "hatchling.build" name = "undate" description = "library for working with uncertain, fuzzy, or partially unknown dates and date intervals" readme = "README.md" -license = { text = "Apache-2" } +license = { text = "Apache-2.0" } requires-python = ">= 3.10" dynamic = ["version"] dependencies = [ "lark[interegular]", "numpy", - "convertdate", + "convertdate>=2.4,<2.4.1", # changes syntax, deprecation warning "strenum; python_version < '3.11'", ] authors = [ @@ -42,7 +42,6 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Intended Audience :: Developers", "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Scientific/Engineering", @@ -51,21 +50,20 @@ classifiers = [ ] -[project.optional-dependencies] +[dependency-groups] docs = 
["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"] -test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"] +test = ["pytest>=9", "pytest-ordering", "pytest-cov"] notebooks = ["jupyterlab", "pandas", "treon", "altair"] -check = ["undate[docs]", "undate[notebooks]", "mypy", "ruff"] +check = [ { include-group = "docs" }, {include-group = "notebooks"}, "mypy", "ruff"] dev = [ "pre-commit>=2.20.0", "twine", "wheel", "build", - "undate[check]", - "undate[docs]", - "undate[test]", + { include-group = "test" }, + { include-group = "check" }, + { include-group = "docs" } ] -all = ["undate", "undate[dev]"] [project.urls] Homepage = "https://github.com/dh-tech/undate-python" @@ -81,8 +79,22 @@ path = "src/undate/__init__.py" [tool.hatch.build.targets.sdist] include = ["src/undate/**/*.py", "src/undate/**/*.lark", "tests/**"] -[tool.pytest.ini_options] -pythonpath = "src/" +[tool.hatch.envs.codegen] +dependencies = ["babel"] + +[tool.hatch.envs.codegen.scripts] +generate = "python scripts/generate_gregorian_grammar.py" + +[tool.pytest] +minversion = "9" +log_level = "INFO" +strict = true +addopts = ["-ra"] +filterwarnings = ["error"] +pythonpath = [ "src/" ] +testpaths = [ + "tests", +] markers = [ "last : run marked tests after all others", "first : run marked tests before all others", @@ -90,3 +102,23 @@ markers = [ [tool.mypy] plugins = ["numpy.typing.mypy_plugin"] + +[tool.ruff.lint] +# Include these rules in addition to ruff's defaults +extend-select = [ + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "I", #isort + "NPY", # numpy-specific rules + "PERF", # perflint + "PTH", # flake8-use-pathlib + "RUF", # ruff-specific rules + "SIM", # flake8-simplify + "UP", # pyupgrade +] +# Can use to ignore specific rules within above selection +ignore = [] + +[tool.ruff.lint.per-file-ignores] +# for test files, don't require docstrings or return type annotations +"tests/**.py" = ["D", "ANN", "RUF"] diff --git a/scripts/generate_gregorian_grammar.py 
b/scripts/generate_gregorian_grammar.py
new file mode 100644
index 0000000..5f821a4
--- /dev/null
+++ b/scripts/generate_gregorian_grammar.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+"""
+This script generates the gregorian_multilang.lark file
+with month names (full and abbreviated) based on the list of
+target languages.
+
+Run this script with hatch to regenerate the file::
+
+    hatch run codegen:generate
+
+"""
+
+import pathlib
+from collections import defaultdict
+
+from babel.dates import get_month_names
+
+# lark grammar path relative to this script
+GRAMMAR_DIR_PATH = (
+    pathlib.Path(__file__).parent.parent / "src" / "undate" / "converters" / "grammars"
+)
+# file that is generated by this script, in that directory
+MONTH_GRAMMAR_FILE = GRAMMAR_DIR_PATH / "gregorian_multilang.lark"
+
+# include month names in the following languages
+languages = [
+    "en",  # English
+    "es",  # Spanish
+    "fr",  # French
+    "de",  # German
+    "rw",  # Kinyarwanda
+    "lg",  # Ganda
+    "ti",  # Tigrinya
+]
+
+# warning to include at top of generated file
+warning_text = """// WARNING: This file is auto-generated. DO NOT EDIT.
+// To regenerate: hatch run codegen:generate
+
+"""
+
+
+def main() -> None:
+    """Regenerate the multilingual month-name grammar file.
+
+    Collects the wide and abbreviated month names for every locale in
+    ``languages``, de-duplicates them per month number, and writes one
+    case-insensitive regex rule per month to ``MONTH_GRAMMAR_FILE``.
+    """
+    # create a dictionary of lists to hold the names for each month
+    all_month_names = defaultdict(list)
+
+    for lang in languages:
+        for width in ["wide", "abbreviated"]:
+            for month_num, month_name in get_month_names(width, locale=lang).items():
+                # some locales use a . on the shortened month; let's ignore that
+                month_name = month_name.strip(".").lower()
+                # In some cases different languages have the same abbreviations;
+                # in some cases, abbreviated and full are the same.
+                # Only add if not already present, to avoid redundancy
+                if month_name not in all_month_names[month_num]:
+                    all_month_names[month_num].append(month_name)
+
+    # month names can contain non-ASCII characters, so write UTF-8
+    # explicitly rather than relying on the platform default encoding
+    with MONTH_GRAMMAR_FILE.open("w", encoding="utf-8") as outfile:
+        outfile.write(warning_text)
+
+        # for each numeric month, generate a rule with all variant names:
+        #   month_1: /(january|jan)/i
+        for i, names in all_month_names.items():
+            # combine all names in a case-insensitive OR regex
+            # sort shortest variants last to avoid partial matches hitting first
+            or_names = "|".join(sorted(names, key=len, reverse=True))
+            outfile.write(f"month_{i}: /({or_names})/i\n")
+
+    # report the path relative to the working directory when possible;
+    # relative_to raises ValueError if the file is not under the cwd
+    try:
+        display_path = MONTH_GRAMMAR_FILE.relative_to(pathlib.Path.cwd())
+    except ValueError:
+        display_path = MONTH_GRAMMAR_FILE
+    print(f"Successfully regenerated {display_path}")
+    print("If the file has changed, make sure to commit the new version.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/undate/__init__.py b/src/undate/__init__.py index 29a2c4a..06082fe 100644 --- a/src/undate/__init__.py +++ b/src/undate/__init__.py @@ -1,14 +1,17 @@ -__version__ = "0.6.1" +__version__ = "0.7.0" +# this sort order is important to avoid circular imports + +# ruff: noqa: I001 from undate.date import DatePrecision, UnDelta -from undate.undate import Undate, Calendar +from undate.undate import Calendar, Undate from undate.interval import UndateInterval __all__ = [ - "Undate", - "UndateInterval", "Calendar", "DatePrecision", "UnDelta", + "Undate", + "UndateInterval", "__version__", ] diff --git a/src/undate/converters/__init__.py b/src/undate/converters/__init__.py index c13f2f1..1024ddb 100644 --- a/src/undate/converters/__init__.py +++ b/src/undate/converters/__init__.py @@ -24,6 +24,6 @@ """ -from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH +from undate.converters.base import GRAMMAR_FILE_PATH, BaseDateConverter -__all__ = ["BaseDateConverter", "GRAMMAR_FILE_PATH"] +__all__ = ["GRAMMAR_FILE_PATH", "BaseDateConverter"] diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py index 93a63a7..3845311 100644
--- a/src/undate/converters/base.py +++ b/src/undate/converters/base.py @@ -47,7 +47,6 @@ import pathlib import pkgutil from functools import cache -from typing import Dict, Type from undate.date import Date @@ -102,12 +101,12 @@ def import_converters(cls) -> int: logger.debug("Loading converters under undate.converters") import undate.converters - # load packages under this path with curent package prefix + # load packages under this path with current package prefix converter_path = undate.converters.__path__ converter_prefix = f"{undate.converters.__name__}." import_count = 0 - for importer, modname, ispkg in pkgutil.iter_modules( + for _importer, modname, _ispkg in pkgutil.iter_modules( converter_path, converter_prefix ): # import everything except the current file @@ -118,14 +117,14 @@ def import_converters(cls) -> int: return import_count @classmethod - def available_converters(cls) -> Dict[str, Type["BaseDateConverter"]]: + def available_converters(cls) -> dict[str, type["BaseDateConverter"]]: """ Dictionary of available converters keyed on name. """ return {c.name: c for c in cls.subclasses()} # type: ignore @classmethod - def subclasses(cls) -> set[Type["BaseDateConverter"]]: + def subclasses(cls) -> set[type["BaseDateConverter"]]: """ Set of available converters classes. 
Includes descendant subclasses, including calendar converters, but does not include diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py index 5836b2f..f0aa6ff 100644 --- a/src/undate/converters/calendars/__init__.py +++ b/src/undate/converters/calendars/__init__.py @@ -6,6 +6,6 @@ __all__ = [ "GregorianDateConverter", "HebrewDateConverter", - "IslamicDateConverter", + "IslamicDateConverter", "SeleucidDateConverter", ] diff --git a/src/undate/converters/calendars/gregorian/__init__.py b/src/undate/converters/calendars/gregorian/__init__.py new file mode 100644 index 0000000..f08896b --- /dev/null +++ b/src/undate/converters/calendars/gregorian/__init__.py @@ -0,0 +1,3 @@ +from undate.converters.calendars.gregorian.converter import GregorianDateConverter + +__all__ = ["GregorianDateConverter"] diff --git a/src/undate/converters/calendars/gregorian.py b/src/undate/converters/calendars/gregorian/converter.py similarity index 67% rename from src/undate/converters/calendars/gregorian.py rename to src/undate/converters/calendars/gregorian/converter.py index b3b103b..31bf05d 100644 --- a/src/undate/converters/calendars/gregorian.py +++ b/src/undate/converters/calendars/gregorian/converter.py @@ -1,6 +1,11 @@ -from calendar import monthrange, isleap +from calendar import isleap, monthrange + +from lark.exceptions import UnexpectedInput from undate.converters.base import BaseCalendarConverter +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer +from undate.undate import Undate class GregorianDateConverter(BaseCalendarConverter): @@ -18,6 +23,9 @@ class GregorianDateConverter(BaseCalendarConverter): #: arbitrary known leap year LEAP_YEAR: int = 2024 + def __init__(self): + self.transformer = GregorianDateTransformer() + def min_month(self) -> int: """First month for the Gregorian calendar.""" return 1 @@ -79,3 +87,25 
@@ def to_gregorian(self, year, month, day) -> tuple[int, int, int]: a common point of comparison. """ return (year, month, day) + + def parse(self, value: str) -> Undate: + """ + Parse a Gregorian date string of any supported precision in any + supported language and return an :class:`~undate.undate.Undate`. + The input date string is preserved in the label of the resulting + Undate object. + """ + if not value: + raise ValueError("Parsing empty string is not supported") + + # parse the input string, then transform to undate object + try: + # parse the string with our Gregorian date parser + parsetree = gregorian_parser.parse(value) + # transform the parse tree into an undate object + undate_obj = self.transformer.transform(parsetree) + # set the original date string as the label + undate_obj.label = value + return undate_obj + except UnexpectedInput as err: + raise ValueError(f"Could not parse '{value}' as a Gregorian date") from err diff --git a/src/undate/converters/calendars/gregorian/parser.py b/src/undate/converters/calendars/gregorian/parser.py new file mode 100644 index 0000000..cfcea53 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/parser.py @@ -0,0 +1,10 @@ +from lark import Lark + +from undate.converters import GRAMMAR_FILE_PATH + +grammar_path = GRAMMAR_FILE_PATH / "gregorian.lark" + +# open based on filename to allow relative imports based on grammar file +gregorian_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="gregorian_date", strict=True +) diff --git a/src/undate/converters/calendars/gregorian/transformer.py b/src/undate/converters/calendars/gregorian/transformer.py new file mode 100644 index 0000000..5fe4df4 --- /dev/null +++ b/src/undate/converters/calendars/gregorian/transformer.py @@ -0,0 +1,42 @@ +from lark import Transformer, Tree + +from undate import Calendar, Undate + + +class GregorianDateTransformer(Transformer): + """Transform a Gregorian date parse tree and return an Undate.""" + + # Currently parser 
should not result in intervals + + calendar = Calendar.GREGORIAN + + def gregorian_date(self, items): + parts = {} + for child in items: + if child.data in ["year", "month", "day"]: + # in each case we expect one integer value; + # anonymous tokens convert to their value and cast as int + value = int(child.children[0]) + parts[str(child.data)] = value + + # initialize and return an undate with year, month, day and + # Gregorian calendar + return Undate(**parts, calendar=self.calendar) + + def year(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="year", children=[value]) + + def month(self, items): + # month has a nested tree for the rule and the value + # the name of the rule (month_1, month_2, etc) gives us the + # number of the month needed for converting the date + tree = items[0] + month_n = tree.data.split("_")[-1] + return Tree(data="month", children=[month_n]) + + def day(self, items): + # combine multiple parts into a single string + value = "".join([str(i) for i in items]) + return Tree(data="day", children=[value]) diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py index a8fdfe7..901c8a3 100644 --- a/src/undate/converters/calendars/hebrew/converter.py +++ b/src/undate/converters/calendars/hebrew/converter.py @@ -1,7 +1,5 @@ -from typing import Union - from convertdate import hebrew # type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -93,7 +91,7 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: """ return hebrew.to_gregorian(year, month, day) - def parse(self, value: str) -> Union[Undate, UndateInterval]: + def parse(self, value: str) -> Undate | UndateInterval: """ Parse a Hebrew date string and return an 
:class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval`. @@ -111,8 +109,8 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err # do we need to support conversion the other direction? - # i.e., generate a Hebrew date from an abitrary undate or undate interval? + # i.e., generate a Hebrew date from an arbitrary undate or undate interval? diff --git a/src/undate/converters/calendars/hebrew/parser.py b/src/undate/converters/calendars/hebrew/parser.py index 3056f85..074d2c5 100644 --- a/src/undate/converters/calendars/hebrew/parser.py +++ b/src/undate/converters/calendars/hebrew/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "hebrew.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used to ambiguity between years and dates - hebrew_parser = Lark(grammar.read(), start="hebrew_date", strict=True) +# open based on filename to allow relative imports based on grammar file +hebrew_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="hebrew_date", strict=True +) diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py index 1ca8c39..8526df2 100644 --- a/src/undate/converters/calendars/hebrew/transformer.py +++ b/src/undate/converters/calendars/hebrew/transformer.py @@ -1,10 +1,10 @@ from lark import Transformer, Tree -from undate import Undate, Calendar +from undate import Calendar, Undate class HebrewUndate(Undate): - """Undate convience subclass; sets default calendar to Hebrew.""" + """Undate convenience subclass; sets default calendar to Hebrew.""" calendar = Calendar.HEBREW diff --git a/src/undate/converters/calendars/islamic/converter.py 
b/src/undate/converters/calendars/islamic/converter.py index 67f2a64..f0962fc 100644 --- a/src/undate/converters/calendars/islamic/converter.py +++ b/src/undate/converters/calendars/islamic/converter.py @@ -1,7 +1,5 @@ -from typing import Union - from convertdate import islamic # type: ignore -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseCalendarConverter @@ -79,7 +77,7 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]: # NOTE: this results in weird numbers for months when year gets sufficiently high return islamic.to_gregorian(year, month, day) - def parse(self, value: str) -> Union[Undate, UndateInterval]: + def parse(self, value: str) -> Undate | UndateInterval: """ Parse an Islamic/Hijri date string and return an :class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval`. @@ -97,7 +95,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # set the original date as a label, with the calendar name undate_obj.label = f"{value} {self.calendar_name}" return undate_obj - except UnexpectedCharacters as err: + except UnexpectedInput as err: raise ValueError(f"Could not parse '{value}' as an Islamic date") from err # do we need to support conversion the other direction? 
diff --git a/src/undate/converters/calendars/islamic/parser.py b/src/undate/converters/calendars/islamic/parser.py index 61a0cf0..d753a7a 100644 --- a/src/undate/converters/calendars/islamic/parser.py +++ b/src/undate/converters/calendars/islamic/parser.py @@ -4,6 +4,7 @@ grammar_path = GRAMMAR_FILE_PATH / "islamic.lark" -with open(grammar_path) as grammar: - # NOTE: LALR parser is faster but can't be used due to ambiguity between years and days - islamic_parser = Lark(grammar.read(), start="islamic_date", strict=True) +# open based on filename to allow relative imports based on grammar file +islamic_parser = Lark.open( + str(grammar_path), rel_to=__file__, start="islamic_date", strict=True +) diff --git a/src/undate/converters/calendars/islamic/transformer.py b/src/undate/converters/calendars/islamic/transformer.py index 7310d86..0f9e48c 100644 --- a/src/undate/converters/calendars/islamic/transformer.py +++ b/src/undate/converters/calendars/islamic/transformer.py @@ -1,10 +1,10 @@ from lark import Transformer, Tree -from undate import Undate, Calendar +from undate import Calendar, Undate class IslamicUndate(Undate): - """Undate convience subclass; sets default calendar to Islamic.""" + """Undate convenience subclass; sets default calendar to Islamic.""" calendar = Calendar.ISLAMIC diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py index 54d66a5..3cc9ae9 100644 --- a/src/undate/converters/combined.py +++ b/src/undate/converters/combined.py @@ -1,20 +1,20 @@ """ -**Experimental** combined parser. Supports EDTF, Hebrew, and Hijri -where dates are unambiguous. (Year-only dates are parsed as EDTF in -Gregorian calendar.) +Combined parser. Supports EDTF, Gregorian, Hebrew, Hijri, and Christian +liturgical dates where dates are unambiguous. Year-only dates are parsed +as EDTF in Gregorian calendar. 
""" -from typing import Union - from lark import Lark -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from lark.visitors import Transformer, merge_transformers from undate import Undate, UndateInterval -from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH -from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters import GRAMMAR_FILE_PATH, BaseDateConverter +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer from undate.converters.calendars.islamic.transformer import IslamicDateTransformer +from undate.converters.edtf.transformer import EDTFTransformer +from undate.converters.holidays import HolidayTransformer class CombinedDateTransformer(Transformer): @@ -33,6 +33,8 @@ def start(self, children): edtf=EDTFTransformer(), hebrew=HebrewDateTransformer(), islamic=IslamicDateTransformer(), + gregorian=GregorianDateTransformer(), + holidays=HolidayTransformer(), ) @@ -45,14 +47,16 @@ def start(self, children): class OmnibusDateConverter(BaseDateConverter): """ Combination parser that aggregates existing parser grammars. - Currently supports EDTF, Hebrew, and Hijri where dates are unambiguous. - (Year-only dates are parsed as EDTF in Gregorian calendar.) + Supports EDTF, Gregorian, Hebrew, Hijri, and Christian liturgical dates + where dates are unambiguous. Year-only dates are parsed as EDTF in + Gregorian calendar. Does not support serialization. 
Example usage:: - Undate.parse("Tammuz 4816", "omnibus") + Undate.parse("Tammuz 4812", "omnibus") + Undate.parse("Easter 1916", "omnibus") """ @@ -62,7 +66,7 @@ class OmnibusDateConverter(BaseDateConverter): def __init__(self): self.transformer = combined_transformer - def parse(self, value: str) -> Union[Undate, UndateInterval]: + def parse(self, value: str) -> Undate | UndateInterval: """ Parse a string in a supported format and return an :class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval`. @@ -75,11 +79,11 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: parsetree = parser.parse(value) # transform returns a list; we want the first item in the list return self.transformer.transform(parsetree)[0] - except UnexpectedCharacters: + except UnexpectedInput as err: raise ValueError( - "Parsing failed: '%s' is not in a recognized date format" % value - ) + f"Parsing failed: '{value}' is not in a recognized date format" + ) from err - def to_string(self, undate: Union[Undate, UndateInterval]) -> str: + def to_string(self, undate: Undate | UndateInterval) -> str: "Not supported by this converter. 
Will raise :class:`ValueError`" raise ValueError("Omnibus converter does not support serialization") diff --git a/src/undate/converters/edtf/converter.py b/src/undate/converters/edtf/converter.py index d0b742f..d9804d6 100644 --- a/src/undate/converters/edtf/converter.py +++ b/src/undate/converters/edtf/converter.py @@ -1,6 +1,4 @@ -from typing import Optional, Union - -from lark.exceptions import UnexpectedCharacters +from lark.exceptions import UnexpectedInput from undate import Undate, UndateInterval from undate.converters.base import BaseDateConverter @@ -8,7 +6,6 @@ from undate.converters.edtf.transformer import EDTFTransformer from undate.date import DatePrecision - #: character for unspecified digits EDTF_UNSPECIFIED_DIGIT: str = "X" @@ -27,7 +24,7 @@ class EDTFDateConverter(BaseDateConverter): def __init__(self): self.transformer = EDTFTransformer() - def parse(self, value: str) -> Union[Undate, UndateInterval]: + def parse(self, value: str) -> Undate | UndateInterval: """ Parse a string in a supported EDTF date or date interval format and return an :class:`~undate.undate.Undate` or @@ -40,19 +37,19 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: try: parsetree = edtf_parser.parse(value) return self.transformer.transform(parsetree) - except UnexpectedCharacters: + except UnexpectedInput as err: raise ValueError( - "Parsing failed: '%s' is not a supported EDTF date format" % value - ) + f"Parsing failed: '{value}' is not a supported EDTF date format" + ) from err def _convert_missing_digits( - self, value: Optional[str], old_missing_digit: str - ) -> Optional[str]: + self, value: str | None, old_missing_digit: str + ) -> str | None: if value: return value.replace(old_missing_digit, EDTF_UNSPECIFIED_DIGIT) return None - def to_string(self, undate: Union[Undate, UndateInterval]) -> str: + def to_string(self, undate: Undate | UndateInterval) -> str: """ Convert an :class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval` to EDTF 
format. diff --git a/src/undate/converters/edtf/parser.py b/src/undate/converters/edtf/parser.py index bc8f0ef..4e1bda0 100644 --- a/src/undate/converters/edtf/parser.py +++ b/src/undate/converters/edtf/parser.py @@ -4,5 +4,5 @@ grammar_path = GRAMMAR_FILE_PATH / "edtf.lark" -with open(grammar_path) as grammar: +with grammar_path.open() as grammar: edtf_parser = Lark(grammar.read(), start="edtf") diff --git a/src/undate/converters/grammars/combined.lark b/src/undate/converters/grammars/combined.lark index 0e77b5c..72fbf97 100644 --- a/src/undate/converters/grammars/combined.lark +++ b/src/undate/converters/grammars/combined.lark @@ -1,7 +1,11 @@ %import common.WS %ignore WS -start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + +start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date | gregorian__gregorian_date | holidays__holiday_date) // Renaming of the import variables is required, as they receive the namespace of this file. 
// See: https://github.com/lark-parser/lark/pull/973#issuecomment-907287565 @@ -23,10 +27,19 @@ start: (edtf__start | hebrew__hebrew_date | islamic__islamic_date ) %import .islamic.month -> islamic__month %import .islamic.year -> islamic__year +// gregorian calendar, in multiple languages +%import .gregorian.gregorian_date -> gregorian__gregorian_date + +// relative import from holidays.lark +%import .holidays.holiday_date -> holidays__holiday_date // override hebrew date to omit year-only, since year without calendar is ambiguous // NOTE: potentially support year with calendar label -%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year +%override hebrew__hebrew_date: hebrew__day hebrew__month hebrew__year | hebrew__month hebrew__year // same for islamic date, year alone is ambiguous -%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year +%override islamic__islamic_date: islamic__day islamic__month islamic__year | islamic__month islamic__year + +// same as above. 
omit year only, since covered by EDTF +// %override gregorian__gregorian_date: day month year | month day year | year month day | month year | year month | day month | month day + diff --git a/src/undate/converters/grammars/gregorian.lark b/src/undate/converters/grammars/gregorian.lark new file mode 100644 index 0000000..93338f9 --- /dev/null +++ b/src/undate/converters/grammars/gregorian.lark @@ -0,0 +1,37 @@ +%import common.WS +%ignore WS + +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + +%import .gregorian_multilang (month_1, month_2, month_3, month_4, month_5, \ + month_6, month_7, month_8, month_9, month_10, month_11, month_12) + + +// no weekday support for now +gregorian_date: day month year | month day year | year month day | month year | year month | year | day month | month day + +// months have 28 to 31 days; we do not expect leading zeroes +day: /[1-9]/ | /[12][0-9]/ | /3[0-1]/ + +// Gregorian calendar started in 1582; assume years with 3 or more digits for now, +// so we can support mixed day / year order unambiguously +year: /\b\d{3,}\b/ +// Use word boundaries to separate from other tokens (esp. numeric day), +// since we otherwise ignore whitespace + +// months +month: month_1 + | month_2 + | month_3 + | month_4 + | month_5 + | month_6 + | month_7 + | month_8 + | month_9 + | month_10 + | month_11 + | month_12 + diff --git a/src/undate/converters/grammars/gregorian_multilang.lark b/src/undate/converters/grammars/gregorian_multilang.lark new file mode 100644 index 0000000..5cd2927 --- /dev/null +++ b/src/undate/converters/grammars/gregorian_multilang.lark @@ -0,0 +1,15 @@ +// WARNING: This file is auto-generated. DO NOT EDIT. 
+// To regenerate: hatch run codegen:generate + +month_1: /(janwaliyo|mutarama|january|janvier|januar|enero|janv|jan|ene|mut|ጥሪ)/i +month_2: /(gashyantare|febwaliyo|february|febrero|février|februar|févr|ለካቲት|feb|gas|ለካ)/i +month_3: /(werurwe|marisi|march|marzo|mars|märz|መጋቢት|mar|wer|መጋ)/i +month_4: /(april|abril|avril|apuli|mata|ሚያዝያ|apr|abr|avr|mat|apu|ሚያ)/i +month_5: /(gicurasi|maayi|mayo|ጉንበት|may|mai|gic|maa|ግን)/i +month_6: /(kamena|junio|juuni|june|juin|juni|jun|kam|juu|ሰነ)/i +month_7: /(nyakanga|juillet|julaayi|julio|july|juil|juli|jul|nya|ሓምለ|ሓም)/i +month_8: /(agusito|august|agosto|kanama|août|aug|ago|kan|agu|ነሓሰ|ነሓ)/i +month_9: /(septiembre|sebuttemba|september|septembre|nzeri|መስከረም|sept|sep|nze|seb|መስ)/i +month_10: /(ukwakira|okitobba|october|octubre|octobre|oktober|ጥቅምቲ|oct|okt|ukw|oki|ጥቅ)/i +month_11: /(ugushyingo|noviembre|november|novembre|novemba|nov|ugu|ሕዳር|ሕዳ)/i +month_12: /(diciembre|december|décembre|dezember|ukuboza|desemba|ታሕሳስ|dec|dic|déc|dez|uku|des|ታሕ)/i diff --git a/src/undate/converters/grammars/hebrew.lark b/src/undate/converters/grammars/hebrew.lark index 118ed98..1b28d19 100644 --- a/src/undate/converters/grammars/hebrew.lark +++ b/src/undate/converters/grammars/hebrew.lark @@ -1,9 +1,13 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order -hebrew_date: weekday? day month comma? year | month year | year +hebrew_date: weekday? day month year | month year | year // TODO: handle date ranges? @@ -31,8 +35,7 @@ month: month_1 // months have 29 or 30 days; we do not expect leading zeroes day: /[1-9]/ | /[12][0-9]/ | /30/ -comma: "," -weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") comma? 
+weekday: ("Monday" | "Tuesday" | "Wednesday" | "Thursday" | "Friday" | "Saturday" | "Sunday") // months, in order; from convertdate list diff --git a/src/undate/converters/grammars/holidays.lark b/src/undate/converters/grammars/holidays.lark new file mode 100644 index 0000000..7b29d64 --- /dev/null +++ b/src/undate/converters/grammars/holidays.lark @@ -0,0 +1,37 @@ +%import common.WS +%ignore WS + +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + + +holiday_date: movable_feast year | fixed_date year? + +// holidays that shift depending on the year +movable_feast: EASTER | EASTER_MONDAY | HOLY_SATURDAY | ASCENSION + | PENTECOST | WHIT_MONDAY | TRINITY | ASH_WEDNESDAY | SHROVE_TUESDAY + +// holidays that are always on the same date +fixed_date: EPIPHANY | CANDLEMAS | ST_PATRICKS | ALL_FOOLS | ST_CYPRIANS + +year: /\d{4}/ + +// all patterns use case-insensitive regex + +// Fixed-date holidays +EPIPHANY: /epiphany/i +CANDLEMAS: /candlemass?/i // recognize with both one and 2 s +ST_PATRICKS: /st\.?\s*patrick'?s?\s*day/i +ALL_FOOLS: /(april|all)\s*fools?\s*day/i +ST_CYPRIANS: /st\.?\s*cyprian'?s?\s*day/i + +// Moveable feasts +EASTER: /easter/i +EASTER_MONDAY: /easter\s*monday/i +HOLY_SATURDAY: /holy\s*saturday/i +ASCENSION: /ascension\s*day|ascension/i +PENTECOST: /pentecost/i +WHIT_MONDAY: /whit\s*monday|whitsun\s*monday/i +TRINITY: /trinity\s*sunday|trinity/i +ASH_WEDNESDAY: /ash\s*wednesday/i +SHROVE_TUESDAY: /shrove\s*tuesday/i diff --git a/src/undate/converters/grammars/islamic.lark b/src/undate/converters/grammars/islamic.lark index 1e4940b..530116a 100644 --- a/src/undate/converters/grammars/islamic.lark +++ b/src/undate/converters/grammars/islamic.lark @@ -1,6 +1,10 @@ %import common.WS %ignore WS +// Ignore periods and commas in dates +%import .undate_common.DATE_PUNCTUATION +%ignore DATE_PUNCTUATION + // only support day month year format for now // parser requires numeric day and year to be distinguished based on order islamic_date: 
weekday? day month year | month year | year diff --git a/src/undate/converters/grammars/undate_common.lark b/src/undate/converters/grammars/undate_common.lark new file mode 100644 index 0000000..ac42b47 --- /dev/null +++ b/src/undate/converters/grammars/undate_common.lark @@ -0,0 +1,3 @@ +// Some abbreviations use periods; some default date formats +// include commas. Ignore both +DATE_PUNCTUATION: "." | "," diff --git a/src/undate/converters/holidays.py b/src/undate/converters/holidays.py new file mode 100644 index 0000000..72c890a --- /dev/null +++ b/src/undate/converters/holidays.py @@ -0,0 +1,166 @@ +""" +Holiday date Converter: parse Christian liturgical dates and convert to Gregorian. +""" + +import datetime + +from convertdate import holidays # type: ignore[import-untyped] +from lark import Lark, Token, Transformer, Tree +from lark.exceptions import UnexpectedInput + +from undate import Calendar, Undate +from undate.converters.base import GRAMMAR_FILE_PATH, BaseDateConverter + +# To add a new holiday: +# 1. Add a name and pattern to holidays.lark grammar file +# 2. Include it in the appropriate section (fixed or movable) +# 3. Add an entry to FIXED_HOLIDAYS or MOVABLE_FEASTS; must match grammar terminal name + + +# holidays that fall on the same date every year +# key must match grammar term; value is tuple of numeric month, day +FIXED_HOLIDAYS = { + "EPIPHANY": (1, 6), # January 6 + "CANDLEMAS": (2, 2), # February 2; 40th day & end of epiphany + "ST_PATRICKS": (3, 17), # March 17 + "ALL_FOOLS": (4, 1), # All / April fools day, April 1 + "ST_CYPRIANS": (9, 16), # St.
Cyprian's Feast day: September 16 +} + +# holidays that shift depending on the year; value is days relative to Easter +MOVABLE_FEASTS = { + "EASTER": 0, # Easter, no offset + "HOLY_SATURDAY": -1, # day before Easter + "EASTER_MONDAY": 1, # day after Easter + "ASCENSION": 39, # fortieth day of Easter + "PENTECOST": 49, # 7 weeks after Easter + "WHIT_MONDAY": 50, # Monday after Pentecost + "TRINITY": 56, # first Sunday after Pentecost + "ASH_WEDNESDAY": -46, # Wednesday of the 7th week before Easter + "SHROVE_TUESDAY": -47, # day before Ash Wednesday +} + + +parser = Lark.open( + str(GRAMMAR_FILE_PATH / "holidays.lark"), rel_to=__file__, start="holiday_date" +) + + +class HolidayTransformer(Transformer): + calendar = Calendar.GREGORIAN + + def year(self, items): + value = "".join([str(i) for i in items]) + return Token("year", value) + # return Tree(data="year", children=[value]) + + def movable_feast(self, items): + # movable feast day can't be calculated without the year, + # so pass through + return items[0] + + def fixed_date(self, items): + item = items[0] + # type is prefixed when included in the combined parser; + # we need the second portion + holiday_name = item.type.split("__")[-1] + try: + month, day = FIXED_HOLIDAYS[holiday_name] + except KeyError as err: + raise ValueError(f"Unknown fixed holiday {holiday_name}") from err + return Tree("fixed_date", [Token("month", month), Token("day", day)]) + + def holiday_date(self, items): + parts = self._get_date_parts(items) + return Undate(**parts) + + def _get_date_parts(self, items) -> dict[str, int | str]: + # recursive method to take parsed tokens and trees and generate + # a dictionary of year, month, day for initializing an undate object + # handles nested tree with month/day (for fixed date holidays) + # and includes movable feast logic, after year is determined. 
+ + parts = {} + date_parts = ["year", "month", "day"] + movable_feast = None + for child in items: + field = value = None + # if this is a token, get type and value + if isinstance(child, Token): + # month/day from fixed date holiday + if child.type in date_parts: + field = child.type + value = child.value + # check for movable feast terminal + elif child.type in MOVABLE_FEASTS: + # collect but don't handle until we know the year + movable_feast = child.type + # handle namespaced token type; happens when called from combined grammar + elif ( + "__" in child.type and child.type.split("__")[-1] in MOVABLE_FEASTS + ): + # collect but don't handle until we know the year + movable_feast = child.type.split("__")[-1] + + # if a tree, recurse on children to get date parts + if isinstance(child, Tree) and child.children: + parts.update(self._get_date_parts(child.children)) + + # if date fields were found, add to dictionary + if field and value: + # currently all date parts are integer only + parts[str(field)] = int(value) + + # if date is a movable feast, calculate relative to Easter based on the year + if movable_feast is not None: + try: + year = parts["year"] + except KeyError as err: + raise ValueError("Year is required for movable feasts") from err + offset = MOVABLE_FEASTS[movable_feast] + + holiday_date = datetime.date(*holidays.easter(year)) + datetime.timedelta( + days=offset + ) + parts.update({"month": holiday_date.month, "day": holiday_date.day}) + + return parts + + +class HolidayDateConverter(BaseDateConverter): + """ + Converter for Christian liturgical dates. + + Supports fixed-date holidays (Epiphany, Candlemass, etc.) and + Easter-relative movable feasts (Easter, Ash Wednesday, Pentecost, etc.). + + Example usage:: + + Undate.parse("Easter 1942", "holidays") + Undate.parse("Ash Wednesday 1942", "holidays") + Undate.parse("Epiphany", "holidays") + + Does not support serialization. 
+ """ + + name = "holidays" + + def __init__(self): + self.transformer = HolidayTransformer() + + def parse(self, value: str) -> Undate: + if not value: + raise ValueError("Parsing empty string is not supported") + + try: + parsetree = parser.parse(value) + # transform the parse tree into an undate or undate interval + undate_obj = self.transformer.transform(parsetree) + # set the input holiday text as a label on the undate object + undate_obj.label = value + return undate_obj + except UnexpectedInput as err: + raise ValueError(f"Could not parse '{value}' as a holiday date") from err + + def to_string(self, undate: Undate) -> str: + raise ValueError("Holiday converter does not support serialization") diff --git a/src/undate/converters/iso8601.py b/src/undate/converters/iso8601.py index 4f05b69..419d8f6 100644 --- a/src/undate/converters/iso8601.py +++ b/src/undate/converters/iso8601.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Union +from typing import ClassVar from undate import Undate, UndateInterval from undate.converters.base import BaseDateConverter @@ -13,13 +13,13 @@ class ISO8601DateFormat(BaseDateConverter): # do not change; Undate relies on this string #: datetime strftime format for known part of date - iso_format: Dict[str, str] = { + iso_format: ClassVar[dict[str, str]] = { "year": "%Y", "month": "%m", "day": "%d", } - def parse(self, value: str) -> Union[Undate, UndateInterval]: + def parse(self, value: str) -> Undate | UndateInterval: """ Parse an ISO88601 string and return an :class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval`. Currently supports @@ -29,7 +29,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: # TODO: what happens if someone gives us a full isoformat date with time? # (ignore, error?) # TODO: what about invalid format? 
- parts: List[str] = value.split("/") # split in case we have a range + parts: list[str] = value.split("/") # split in case we have a range if len(parts) == 1: return self._parse_single_date(parts[0]) elif len(parts) == 2: @@ -43,7 +43,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]: def _parse_single_date(self, value: str) -> Undate: # split single iso date into parts; convert to int or None # special case: missing year - date_parts: List[Union[int, None]] = [] + date_parts: list[int | None] = [] if value.startswith("--"): date_parts.append(None) # year unknown value = value[2:] @@ -53,7 +53,7 @@ def _parse_single_date(self, value: str) -> Undate: # Argument of type "int | None" cannot be assigned to parameter "formatter" of type "BaseDateFormat | None" in function "__init__" return Undate(*date_parts) # type: ignore - def to_string(self, undate: Union[Undate, UndateInterval]) -> str: + def to_string(self, undate: Undate | UndateInterval) -> str: """ Convert an :class:`~undate.undate.Undate` or :class:`~undate.undate.UndateInterval` to ISO8601 string format. 
@@ -70,13 +70,13 @@ def to_string(self, undate: Union[Undate, UndateInterval]) -> str: def _undate_to_string(self, undate: Undate) -> str: # serialize to iso format for simplicity, for now - date_parts: List[Union[str, None]] = [] + date_parts: list[str | None] = [] # for each part of the date that is known, generate the string format # then combine # TODO: should error if we have year and day but no month # TODO: may want to refactor and take advantage of the year/month/day properties # added for use in EDTF formatter code - for date_portion, iso_format in self.iso_format.items(): + for date_portion in self.iso_format: # is known means fully known, means guaranteed integer if undate.is_known(date_portion): # NOTE: datetime strftime for %Y for 3-digit year @@ -84,26 +84,26 @@ def _undate_to_string(self, undate: Undate) -> str: # and not others; force year to always be 4 digits if date_portion == "year" and undate.year: try: - date_parts.append("%04d" % int(undate.year)) + date_parts.append(f"{undate.year:04}") except ValueError: # shouldn't happen because of is_known date_parts.append(undate.year) elif date_portion == "month" and undate.month: try: - date_parts.append("%02d" % int(undate.month)) + date_parts.append(f"{undate.month:02}") except ValueError: # shouldn't happen because of is_known date_parts.append(undate.month) elif date_portion == "day" and undate.day: try: - date_parts.append("%02d" % int(undate.day)) + date_parts.append(f"{undate.day:02}") except ValueError: # shouldn't happen because of is_known date_parts.append(undate.day) elif date_portion == "year": # if year is not known, add '-' for year portion, - # to genereate --MM-DD unknown year format + # to generate --MM-DD unknown year format date_parts.append("-") # TODO: fix type error: "list[str | None]" is incompatible with "Iterable[str]" return "-".join(date_parts) # type: ignore diff --git a/src/undate/date.py b/src/undate/date.py index 44f79fa..9ef3da3 100644 --- a/src/undate/date.py +++ 
b/src/undate/date.py @@ -1,10 +1,9 @@ -from enum import IntEnum -from dataclasses import dataclass, replace import operator +from collections.abc import Iterable +from dataclasses import dataclass, replace +from enum import IntEnum # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Optional, Union, Iterable - import numpy as np @@ -12,7 +11,7 @@ class Timedelta(np.ndarray): """Convenience class to make :class:`numpy.timedelta64` act more like the built-in python :class:`datetime.timedelta`.""" - def __new__(cls, deltadays: Union[np.timedelta64, int]): + def __new__(cls, deltadays: np.timedelta64 | int): if isinstance(deltadays, int): deltadays = np.timedelta64(deltadays, "D") data = np.asarray(deltadays, dtype="timedelta64") @@ -186,9 +185,9 @@ class Date(np.ndarray): def __new__( cls, - year: Union[int, np.datetime64], - month: Optional[int] = None, - day: Optional[int] = None, + year: int | np.datetime64, + month: int | None = None, + day: int | None = None, ): if isinstance(year, np.datetime64): _data = year @@ -231,21 +230,21 @@ def year(self) -> int: return int(str(self.astype("datetime64[Y]"))) @property - def month(self) -> Optional[int]: + def month(self) -> int | None: # if date unit is year, don't return a month (only M/D) if self.dtype != "datetime64[Y]": return int(str(self.astype("datetime64[M]")).split("-")[-1]) return None @property - def day(self) -> Optional[int]: + def day(self) -> int | None: # only return a day if date unit is in days if self.dtype == "datetime64[D]": return int(str(self.astype("datetime64[D]")).split("-")[-1]) return None @property - def weekday(self) -> Optional[int]: + def weekday(self) -> int | None: """Equivalent to :meth:`datetime.date.weekday`; returns day of week as an integer where Monday is 0 and Sunday is 6. Only supported for dates with date unit in days. 
@@ -261,7 +260,7 @@ def weekday(self) -> Optional[int]: thursday_week = self.astype("datetime64[W]") days_from_thursday = (self - thursday_week).astype(int) # if monday is 0, thursday is 3 - return (days_from_thursday + 3) % 7 + return int((days_from_thursday + 3) % 7) return None @@ -280,12 +279,24 @@ def __sub__(self, other): # NOTE: add should not be subclassed because we want to return a Date, not a delta +class Weekday(IntEnum): + """Weekday as an integer, compatible with :meth:`datetime.date.weekday`.""" + + MONDAY = 0 + TUESDAY = 1 + WEDNESDAY = 2 + THURSDAY = 3 + FRIDAY = 4 + SATURDAY = 5 + SUNDAY = 6 + + class DatePrecision(IntEnum): """date precision, to indicate date precision independent from how much of the date is known.""" # NOTE: values MUST be ordered based on the relative size or - # precison of the time unit. That is, the smaller the unit, the more precise + # precision of the time unit. That is, the smaller the unit, the more precise # it is: a day is more precise than a month, a month is more precise than a year, # (DatePrecision.year < DatePrecision.month) @@ -305,4 +316,4 @@ def __str__(self): return f"{self.name}" # NOTE: consider harmonizing / using numpy date units: - # years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’) + # years (Y), months (M), weeks (W), and days (D) diff --git a/src/undate/interval.py b/src/undate/interval.py index a7fbe55..774cc89 100644 --- a/src/undate/interval.py +++ b/src/undate/interval.py @@ -1,10 +1,9 @@ # Pre 3.10 requires Union for multiple types, e.g. 
Union[int, None] instead of int | None -from typing import Optional, Union - +from typing import Optional from undate import Undate -from undate.date import ONE_DAY, ONE_YEAR, Timedelta from undate.converters.base import BaseDateConverter +from undate.date import ONE_DAY, ONE_YEAR, Timedelta class UndateInterval: @@ -19,18 +18,18 @@ class UndateInterval: """ # date range between two undates - earliest: Union[Undate, None] - latest: Union[Undate, None] - label: Union[str, None] + earliest: Undate | None + latest: Undate | None + label: str | None # TODO: think about adding an optional precision / length /size field # using DatePrecision for intervals of any standard duration (decade, century) def __init__( self, - earliest: Optional[Undate] = None, - latest: Optional[Undate] = None, - label: Optional[str] = None, + earliest: Undate | None = None, + latest: Undate | None = None, + label: str | None = None, ): # takes two undate objects; allows conversion from supported types if earliest: @@ -58,7 +57,7 @@ def __init__( def __str__(self) -> str: # using EDTF syntax for open ranges - return "%s/%s" % (self.earliest or "..", self.latest or "") + return f"{self.earliest or '..'}/{self.latest or ''}" def format(self, format) -> str: """format this undate interval as a string using the specified format; @@ -156,12 +155,10 @@ def __contains__(self, other: object) -> bool: # bounds of this interval return ( self.earliest is None - or other_earliest is not None - and other_earliest >= self.earliest + or (other_earliest is not None and other_earliest >= self.earliest) ) and ( self.latest is None - or other_latest is not None - and other_latest <= self.latest + or (other_latest is not None and other_latest <= self.latest) ) def intersection(self, other: "UndateInterval") -> Optional["UndateInterval"]: diff --git a/src/undate/undate.py b/src/undate/undate.py index 5ca407f..7128084 100644 --- a/src/undate/undate.py +++ b/src/undate/undate.py @@ -1,9 +1,8 @@ from __future__ import 
annotations import datetime -from enum import auto - import re +from enum import auto from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -17,7 +16,6 @@ from strenum import StrEnum # type: ignore # Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None -from typing import Dict, Optional, Union from undate.converters.base import BaseCalendarConverter, BaseDateConverter from undate.date import ONE_DAY, Date, DatePrecision, Timedelta, UnDelta @@ -60,7 +58,7 @@ class Undate: latest: Date #: A string to label a specific undate, e.g. "German Unity Date 2022" for Oct. 3, 2022. #: Labels are not taken into account when comparing undate objects. - label: Union[str, None] = None + label: str | None = None converter: BaseDateConverter #: precision of the date (day, month, year, etc.) precision: DatePrecision @@ -77,20 +75,20 @@ class Undate: def __init__( self, - year: Optional[Union[int, str]] = None, - month: Optional[Union[int, str]] = None, - day: Optional[Union[int, str]] = None, - converter: Optional[BaseDateConverter] = None, - label: Optional[str] = None, - calendar: Optional[Union[str, Calendar]] = None, + year: int | str | None = None, + month: int | str | None = None, + day: int | str | None = None, + converter: BaseDateConverter | None = None, + label: str | None = None, + calendar: str | Calendar | None = None, ): # everything is optional but something is required - if all([val is None for val in [year, month, day]]): + if all(val is None for val in [year, month, day]): raise ValueError("At least one of year, month, or day must be specified") # keep track of initial values and which values are known # TODO: add validation: if str, must be expected length - self.initial_values: Dict[str, Optional[Union[int, str]]] = { + self.initial_values: dict[str, int | str | None] = { "year": year, "month": month, "day": day, @@ -168,7 +166,7 @@ def calculate_earliest_latest(self, year, month, day): day = None # if day is numeric, use as is - 
if isinstance(day, int) or isinstance(day, str) and day.isnumeric(): + if isinstance(day, int) or (isinstance(day, str) and day.isnumeric()): day = int(day) # update initial value - fully known day self.initial_values["day"] = day @@ -177,7 +175,7 @@ def calculate_earliest_latest(self, year, month, day): # if we have no day or partial day, calculate min / max min_day = 1 # is min day ever anything other than 1 ? rel_year = year if year and isinstance(year, int) else max_year - # use month if it is an integer; otherwise use previusly determined + # use month if it is an integer; otherwise use previously determined # max month (which may not be 12 depending if partially unknown) rel_month = month if month and isinstance(month, int) else latest_month @@ -201,7 +199,7 @@ def calculate_earliest_latest(self, year, month, day): *self.calendar_converter.to_gregorian(max_year, latest_month, max_day) ) - def set_calendar(self, calendar: Union[str, Calendar]): + def set_calendar(self, calendar: str | Calendar): """Find calendar by name if passed as string and set on the object. Only intended for use at initialization time; use :meth:`as_calendar` to change calendar.""" @@ -215,7 +213,7 @@ def set_calendar(self, calendar: Union[str, Calendar]): raise ValueError(f"Calendar `{calendar}` is not supported") from err self.calendar = calendar - def as_calendar(self, calendar: Union[str, Calendar]): + def as_calendar(self, calendar: str | Calendar): """Return a new :class:`Undate` object with the same year, month, day, and labels used to initialize the current object, but with a different calendar. 
Note that this does NOT do calendar conversion, but reinterprets current numeric year, month, day values @@ -261,7 +259,7 @@ def __repr__(self) -> str: return f"undate.Undate({init_str})" @classmethod - def parse(cls, date_string, format) -> Union["Undate", UndateInterval]: + def parse(cls, date_string, format) -> Undate | UndateInterval: """parse a string to an undate or undate interval using the specified format; for now, only supports named converters""" converter_cls = BaseDateConverter.available_converters().get(format, None) @@ -282,7 +280,7 @@ def format(self, format) -> str: raise ValueError(f"Unsupported format '{format}'") @classmethod - def _comparison_type(cls, other: object) -> "Undate": + def _comparison_type(cls, other: object) -> Undate: """Common logic for type handling in comparison methods. Converts to Undate object if possible, otherwise raises NotImplementedError exception. Uses :meth:`to_undate` for conversion. @@ -332,8 +330,8 @@ def __eq__(self, other: object) -> bool: if looks_equal and ( # if any part of either date that is known is _partially_ known, # then these dates are not equal - any([self.is_partially_known(p) for p in self.initial_values.keys()]) - or any([other.is_partially_known(p) for p in other.initial_values.keys()]) + any(self.is_partially_known(p) for p in self.initial_values) + or any(other.is_partially_known(p) for p in other.initial_values) ): return False @@ -389,14 +387,14 @@ def __gt__(self, other: object) -> bool: # if either date has a completely unknown year, then we can't compare # NOTE: this means that gt and lt will both be false when comparing # with a date with an unknown year... 
- if self.unknown_year or isinstance(other, Undate) and other.unknown_year: + if self.unknown_year or (isinstance(other, Undate) and other.unknown_year): return False return not (self < other or self == other) def __le__(self, other: object) -> bool: # if either date has a completely unknown year, then we can't compare - if self.unknown_year or isinstance(other, Undate) and other.unknown_year: + if self.unknown_year or (isinstance(other, Undate) and other.unknown_year): return False return self == other or self < other @@ -430,7 +428,7 @@ def __contains__(self, other: object) -> bool: ) @classmethod - def to_undate(cls, other: object) -> "Undate": + def to_undate(cls, other: object) -> Undate: """Convert arbitrary object to Undate, if possible. Raises TypeError if conversion is not possible. @@ -481,7 +479,7 @@ def is_partially_known(self, part: str) -> bool: # and self.initial_values[part].replace(self.MISSING_DIGIT, "") != "" @property - def year(self) -> Optional[str]: + def year(self) -> str | None: "year as string (minimum 4 characters), if year is known" year = self._get_date_part("year") if year: @@ -492,7 +490,7 @@ def year(self) -> Optional[str]: return None @property - def month(self) -> Optional[str]: + def month(self) -> str | None: "month as 2-character string, or None if unknown/unset" # TODO: do we allow None for unknown month with day-level granularity? 
# TODO: need to distinguish between unknown (XX) and unset/not part of the date due to granularity @@ -505,7 +503,7 @@ def month(self) -> Optional[str]: return None @property - def day(self) -> Optional[str]: + def day(self) -> str | None: "day as 2-character string or None if unset" day = self._get_date_part("day") if day: @@ -516,7 +514,7 @@ def day(self) -> Optional[str]: return self.MISSING_DIGIT * 2 return None - def _get_date_part(self, part: str) -> Optional[str]: + def _get_date_part(self, part: str) -> str | None: value = self.initial_values.get(part) return str(value) if value else None @@ -589,7 +587,7 @@ def duration(self) -> Timedelta | UnDelta: # if year is known and no values are partially known, # we can calculate a time delta based on earliest + latest if self.known_year and not any( - [self.is_partially_known(part) for part in ["year", "month", "day"]] + self.is_partially_known(part) for part in ["year", "month", "day"] ): # subtract earliest from latest and add a day to include start day in the count return self.latest - self.earliest + ONE_DAY @@ -655,7 +653,7 @@ def _missing_digit_minmax( # assuming two digit only (i.e., month or day) possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)] # ensure input value has two digits - value = "%02s" % value + value = f"{value:>2}" # generate regex where missing digit matches anything val_pattern = re.compile(value.replace(self.MISSING_DIGIT, ".")) # identify all possible matches, then get min and max diff --git a/tests/test_converters/edtf/test_edtf_parser.py b/tests/test_converters/edtf/test_edtf_parser.py index 73d4e02..7735d2f 100644 --- a/tests/test_converters/edtf/test_edtf_parser.py +++ b/tests/test_converters/edtf/test_edtf_parser.py @@ -1,4 +1,6 @@ import pytest +from lark.exceptions import UnexpectedCharacters + from undate.converters.edtf.parser import edtf_parser # for now, just test that valid dates can be parsed @@ -51,5 +53,5 @@ def test_should_parse(date_string): 
@pytest.mark.parametrize("date_string", error_cases) def test_should_error(date_string): - with pytest.raises(Exception): + with pytest.raises(UnexpectedCharacters): edtf_parser.parse(date_string) diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py index 6265c15..4d67eca 100644 --- a/tests/test_converters/test_base.py +++ b/tests/test_converters/test_base.py @@ -1,7 +1,8 @@ import logging import pytest -from undate.converters.base import BaseDateConverter, BaseCalendarConverter + +from undate.converters.base import BaseCalendarConverter, BaseDateConverter from undate.converters.calendars import ( GregorianDateConverter, HebrewDateConverter, diff --git a/tests/test_converters/test_calendars/test_gregorian.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py similarity index 55% rename from tests/test_converters/test_calendars/test_gregorian.py rename to tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py index e0bf5ef..0b26727 100644 --- a/tests/test_converters/test_calendars/test_gregorian.py +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py @@ -1,4 +1,8 @@ +import pytest + from undate.converters.calendars import GregorianDateConverter +from undate.date import DatePrecision +from undate.undate import Calendar, Undate class TestGregorianDateConverter: @@ -38,3 +42,35 @@ def test_representative_years(self): converter.LEAP_YEAR, converter.NON_LEAP_YEAR, ] + + def test_parse(self): + # day + date_str = "2022 Ugushyingo 26" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(2022, 11, 26) # Ugushyingo = November + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.DAY + assert date.label == date_str + + # month + date_str = "avril 1362" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(1362, 4) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == 
DatePrecision.MONTH + assert date.label == date_str + + # year + date_str = "932" + date = GregorianDateConverter().parse(date_str) + assert date == Undate(932) + assert date.calendar == Calendar.GREGORIAN + assert date.precision == DatePrecision.YEAR + assert date.label == date_str + + def test_parse_errors(self): + with pytest.raises(ValueError, match="empty string is not supported"): + GregorianDateConverter().parse("") + + with pytest.raises(ValueError, match="Could not parse"): + GregorianDateConverter().parse("Foo 1920") diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py new file mode 100644 index 0000000..0acd657 --- /dev/null +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py @@ -0,0 +1,72 @@ +import pytest +from lark.exceptions import UnexpectedCharacters, UnexpectedEOF + +from undate.converters.calendars.gregorian.parser import gregorian_parser + +# test that valid dates can be parsed to confirm parser is working correctly + +testcases = [ + # year + "2012", + # three digit year + "566", + # month + year + "Jan 1960", + "Feb 1801", + "1900 Feb", + # day + month + year in any order + "May 5 1602", + "5 May 1602", + "1602 October 5", + # day + month + "December 5", + "5 December", + # Kinyarwanda (rw) + "2025 ugu. 4", + "2025 Ugushyingo 4", + "2025 ugu", + "2025 Ugushyingo", + # Ganda (lg) + "4 Novemba 2025", + "4 Nov 2025", + "Novemba 2025", + "4 Novemba", + # Tigrinya (ti) + "ሕዳ 4, 2025", + "ሕዳር 4 2025", + # French + "18 avril 2025", + "18 avr. 
2025", + # case-insensitive + "18 JUNE 2025", + "Avril 2025", +] + + +@pytest.mark.parametrize("date_string", testcases) +def test_should_parse(date_string): + assert gregorian_parser.parse(date_string) + + +error_cases = [ + # invalid days + ("0 June 1006", UnexpectedCharacters), + ("42 March 1206", UnexpectedCharacters), + # month alone + ("Juin", UnexpectedEOF), + # day only + ("12 ", UnexpectedEOF), + # non-Gregorian month + ("5 Tammuz 5403", UnexpectedCharacters), + ("31 Tishri 5403", UnexpectedCharacters), + # invalid month + ("Foo 383", UnexpectedCharacters), + # wrong format + ("2024-10-02", UnexpectedCharacters), +] + + +@pytest.mark.parametrize("date_string,exception", error_cases) +def test_should_error(date_string, exception): + with pytest.raises(exception): + gregorian_parser.parse(date_string) diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py new file mode 100644 index 0000000..a6107ce --- /dev/null +++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py @@ -0,0 +1,31 @@ +import pytest + +from undate.converters.calendars.gregorian.parser import gregorian_parser +from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer +from undate.date import DatePrecision +from undate.undate import Calendar, Undate + +testcases = [ + ("2012", Undate(2012), DatePrecision.YEAR), + ("May 13 1602", Undate(1602, 5, 13), DatePrecision.DAY), + ("Jan 1960", Undate(1960, 1), DatePrecision.MONTH), + ("2022 ugu. 
4", Undate(2022, 11, 4), DatePrecision.DAY), + ("2022 Ugushyingo", Undate(2022, 11), DatePrecision.MONTH), + ("4 Novemba", Undate(month=11, day=4), DatePrecision.DAY), + # ignores whitespace, comma, period + ("4Novemba", Undate(month=11, day=4), DatePrecision.DAY), + ("18 avril, 2025", Undate(2025, 4, 18), DatePrecision.DAY), +] + + +@pytest.mark.parametrize("date_string,expected,expected_precision", testcases) +def test_transform(date_string, expected, expected_precision): + transformer = GregorianDateTransformer(visit_tokens=True) + # parse the input string, then transform to undate object + parsetree = gregorian_parser.parse(date_string) + transformed_date = transformer.transform(parsetree) + # use EDTF to compare so we can check dates with unknown years + assert transformed_date.format("EDTF") == expected.format("EDTF") + # currently only returns undate, parser doesn't support intervals + assert transformed_date.precision == expected_precision + assert transformed_date.calendar == Calendar.GREGORIAN diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py index 6fe8c96..db5df6c 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py @@ -2,8 +2,8 @@ from undate.converters.calendars import HebrewDateConverter from undate.converters.calendars.hebrew.transformer import HebrewUndate +from undate.date import Date, DatePrecision from undate.undate import Calendar, Undate -from undate.date import DatePrecision, Date class TestHebrewDateConverter: @@ -136,7 +136,7 @@ def test_compare_across_calendars(self): assert HebrewUndate(4816, 4, 26) > Undate(1055, 5) # 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056) - # so it falls within or is c ontained by July 1056 + # so it falls within or is contained by July 1056 assert HebrewUndate(4816, 4, 26) in 
Undate(1056, 7) assert HebrewUndate(4816, 4, 26) not in Undate(1054) diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py index 69b929e..5810e70 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py @@ -3,7 +3,6 @@ from undate.converters.calendars.hebrew.parser import hebrew_parser - # for now, just test that valid dates can be parsed testcases = [ diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py index 7dcca83..ec0d1dc 100644 --- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py +++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py @@ -1,11 +1,12 @@ import pytest + from undate.converters.calendars.hebrew.parser import hebrew_parser from undate.converters.calendars.hebrew.transformer import ( HebrewDateTransformer, HebrewUndate, ) -from undate.undate import Undate, Calendar from undate.date import DatePrecision +from undate.undate import Calendar, Undate def test_hebrew_undate(): diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py index cfcace2..4f88c0c 100644 --- a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py +++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py @@ -2,8 +2,8 @@ from undate.converters.calendars import IslamicDateConverter from undate.converters.calendars.islamic.transformer import IslamicUndate +from undate.date import Date, DatePrecision from undate.undate import Calendar, Undate -from undate.date import DatePrecision, Date class TestIslamicDateConverter: diff --git 
a/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py index de4901e..c8ef39f 100644 --- a/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py +++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py @@ -3,7 +3,6 @@ from undate.converters.calendars.islamic.parser import islamic_parser - # for now, just test that valid dates can be parsed testcases = [ diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py index 04ff53b..15e8cb5 100644 --- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py +++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py @@ -1,11 +1,12 @@ import pytest + from undate.converters.calendars.islamic.parser import islamic_parser from undate.converters.calendars.islamic.transformer import ( IslamicDateTransformer, IslamicUndate, ) -from undate.undate import Undate, Calendar from undate.date import DatePrecision +from undate.undate import Calendar, Undate def test_islamic_undate(): @@ -28,7 +29,7 @@ def test_islamic_undate(): # examples from ISMI data (reformatted to day month year) # Rabi 1 = month 3 ("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY), - ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH), + ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH), ("884", IslamicUndate(884), DatePrecision.YEAR), # Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)), # add when we support parsing ranges: diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py index 717a16e..c229880 100644 --- a/tests/test_converters/test_combined_parser.py +++ b/tests/test_converters/test_combined_parser.py @@ -1,8 +1,7 @@ import pytest -from undate.converters.combined 
import parser, combined_transformer - from undate import Undate, UndateInterval +from undate.converters.combined import combined_transformer, parser # test that valid dates can be parsed @@ -19,6 +18,17 @@ ("Jumādā I 1243", Undate(1243, 5, calendar="Islamic")), ("7 Jumādā I 1243", Undate(1243, 5, 7, calendar="Islamic")), ("14 Rabīʿ I 901", Undate(901, 3, 14, calendar="Islamic")), + # Gregorian with non-numeric month (full or abbreviated) + ("June 1602", Undate(1602, 6, calendar="Gregorian")), + ("13 Jan 1602", Undate(1602, 1, 13, calendar="Gregorian")), + ("2022 ugu. 4", Undate(2022, 11, 4, calendar="Gregorian")), + ("18 avril", Undate(month=4, day=18, calendar="Gregorian")), + # Christian liturgical dates + ("Easter 1942", Undate(1942, 4, 5)), + ("Epiphany 1921", Undate(1921, 1, 6)), + ("Pentecost 2016", Undate(2016, 5, 15)), + ("Ash Wednesday 2000", Undate(2000, 3, 8)), + ("Whit Monday 2023", Undate(2023, 5, 29)), # codespell:ignore whit ] diff --git a/tests/test_converters/test_edtf.py b/tests/test_converters/test_edtf.py index 3262e46..e54823a 100644 --- a/tests/test_converters/test_edtf.py +++ b/tests/test_converters/test_edtf.py @@ -1,6 +1,7 @@ import pytest -from undate.converters.edtf import EDTFDateConverter + from undate import Undate, UndateInterval +from undate.converters.edtf import EDTFDateConverter class TestEDTFDateConverter: diff --git a/tests/test_converters/test_holidays.py b/tests/test_converters/test_holidays.py new file mode 100644 index 0000000..1aa21a3 --- /dev/null +++ b/tests/test_converters/test_holidays.py @@ -0,0 +1,115 @@ +import pytest +from lark import Token, Tree + +from undate import Calendar, Undate +from undate.converters.holidays import HolidayDateConverter, HolidayTransformer +from undate.date import Weekday + + +class TestHolidayConverter: + converter = HolidayDateConverter() + + @pytest.mark.parametrize( + "input_string,expected", + [ + ("Epiphany 1921", Undate(1921, 1, 6)), + ("candlemas 1913", Undate(1913, 2, 2)), + 
("Candlemass 1862", Undate(1862, 2, 2)), + ("st. patrick's day 1823", Undate(1823, 3, 17)), + ("st patrick's day 1901", Undate(1901, 3, 17)), + ("all fools day 1933", Undate(1933, 4, 1)), + ("st. cyprian's day 1902", Undate(1902, 9, 16)), + ], + ) + def test_fixed_holidays(self, input_string, expected): + assert self.converter.parse(input_string) == expected + + @pytest.mark.parametrize( + "input_string,expected,expected_weekday", + [ + ("Easter 1900", Undate(1900, 4, 15), Weekday.SUNDAY), + ("easter monday 1925", Undate(1925, 4, 13), Weekday.MONDAY), + ("holy saturday 2018", Undate(2018, 3, 31), Weekday.SATURDAY), + ("Ash Wednesday 2000", Undate(2000, 3, 8), Weekday.WEDNESDAY), + ("shrove tuesday 1940", Undate(1940, 2, 6), Weekday.TUESDAY), + ("Ascension 1988", Undate(1988, 5, 12), Weekday.THURSDAY), + ("Ascension Day 1999", Undate(1999, 5, 13), Weekday.THURSDAY), + ("Pentecost 2016", Undate(2016, 5, 15), Weekday.SUNDAY), + ( + "whit monday 2005", # codespell:ignore whit + Undate(2005, 5, 16), + Weekday.MONDAY, + ), + ("whitsun monday 2023", Undate(2023, 5, 29), Weekday.MONDAY), + ("trinity 1978", Undate(1978, 5, 21), Weekday.SUNDAY), + ("Trinity Sunday 1967", Undate(1967, 5, 21), Weekday.SUNDAY), + ], + ) + def test_moveable_feasts(self, input_string, expected, expected_weekday): + result = self.converter.parse(input_string) + assert result == expected + assert result.label == input_string + assert result.earliest.weekday == expected_weekday + + def test_holiday_without_year(self): + result = self.converter.parse("Epiphany") + assert result.label == "Epiphany" + assert result.format("EDTF") == "XXXX-01-06" + assert not result.known_year + assert result.calendar == Calendar.GREGORIAN + + def test_undate_parse(self): + # accessible through main undate parse method + assert Undate.parse("Epiphany 1942", "holidays") == Undate(1942, 1, 6) + + def test_parse_empty(self): + with pytest.raises(ValueError, match="empty string"): + self.converter.parse("") + + def 
test_parse_error(self): + with pytest.raises(ValueError, match="Could not parse"): + self.converter.parse("Not a holiday") + + def test_moveable_without_year(self): + with pytest.raises(ValueError, match="Could not parse"): + self.converter.parse("Easter") + + def test_to_string_error(self): + with pytest.raises(ValueError, match="does not support"): + self.converter.to_string(Undate(1916)) + + +# edge cases - should not happen from parser input but possible + + +class TestHolidayTransformer: + def test_fixed_date(self): + transformer = HolidayTransformer() + result = transformer.fixed_date([Token("EPIPHANY", "")]) + assert isinstance(result, Tree) + assert len(result.children) == 2 + assert all(isinstance(child, Token) for child in result.children) + assert result.children[0].type == "month" + assert result.children[0].value == 1 + assert result.children[1].type == "day" + assert result.children[1].value == 6 + + # namespaced token + result = transformer.fixed_date([Token("holiday__EPIPHANY", "")]) + assert isinstance(result, Tree) + assert len(result.children) == 2 + assert all(isinstance(child, Token) for child in result.children) + assert result.children[0].type == "month" + assert result.children[0].value == 1 + assert result.children[1].type == "day" + assert result.children[1].value == 6 + + # unknown fixed holiday should raise value error + with pytest.raises(ValueError, match="Unknown fixed holiday"): + transformer.fixed_date([Token("epiphany", "")]) + + def test_get_date_parts(self): + transformer = HolidayTransformer() + # movable feast without year is not supported + with pytest.raises(ValueError, match="Year is required"): + transformer._get_date_parts([Token("EASTER", "")]) diff --git a/tests/test_date.py b/tests/test_date.py index fc6cc72..e0eb5d7 100644 --- a/tests/test_date.py +++ b/tests/test_date.py @@ -5,8 +5,8 @@ from undate.date import ( ONE_DAY, - ONE_YEAR, ONE_MONTH_MAX, + ONE_YEAR, Date, DatePrecision, Timedelta, @@ -159,7 +159,7 @@ def 
test_gt(self): assert not ten_twelve > UnInt(13, 23) # unsupported type with pytest.raises(TypeError): - ten_twelve > "three" + assert ten_twelve > "three" def test_lt(self): ten_twelve = UnInt(10, 12) @@ -173,7 +173,7 @@ def test_lt(self): assert not ten_twelve < UnInt(2, 4) # unsupported type with pytest.raises(TypeError): - ten_twelve < "three" + assert ten_twelve < "three" def test_iterable(self): anymonth_days = UnInt(lower=28, upper=31) diff --git a/tests/test_interval.py b/tests/test_interval.py index dbf28b3..828833c 100644 --- a/tests/test_interval.py +++ b/tests/test_interval.py @@ -63,7 +63,7 @@ def test_repr(self): closed_interval = UndateInterval(Undate(2022), Undate(2023)) assert ( repr(closed_interval) - == f"undate.UndateInterval(earliest={repr(closed_interval.earliest)}, latest={repr(closed_interval.latest)})" + == f"undate.UndateInterval(earliest={closed_interval.earliest!r}, latest={closed_interval.latest!r})" ) # should be able to evaluate repr string to get an equivalent object assert eval(repr(closed_interval)) == closed_interval @@ -71,7 +71,7 @@ def test_repr(self): fancy_epoch = UndateInterval(Undate(2022), Undate(2023), label="Fancy Epoch") assert ( repr(fancy_epoch) - == f"undate.UndateInterval(earliest={repr(fancy_epoch.earliest)}, latest={repr(fancy_epoch.latest)}, label='Fancy Epoch')" + == f"undate.UndateInterval(earliest={fancy_epoch.earliest!r}, latest={fancy_epoch.latest!r}, label='Fancy Epoch')" ) assert eval(repr(fancy_epoch)) == fancy_epoch @@ -80,7 +80,7 @@ def test_repr(self): ) assert ( repr(open_interval) - == f"undate.UndateInterval(earliest={repr(open_interval.earliest)})" + == f"undate.UndateInterval(earliest={open_interval.earliest!r})" ) assert eval(repr(open_interval)) == open_interval diff --git a/tests/test_undate.py b/tests/test_undate.py index 2aa855d..3d65667 100644 --- a/tests/test_undate.py +++ b/tests/test_undate.py @@ -4,10 +4,10 @@ import pytest -from undate import Undate, UndateInterval, Calendar -from 
undate.undate import StrEnum # import whichever version is used there +from undate import Calendar, Undate, UndateInterval from undate.converters.base import BaseCalendarConverter, BaseDateConverter from undate.date import Date, DatePrecision, Timedelta, UnDelta, UnInt +from undate.undate import StrEnum # import whichever version is used there class TestUndate: @@ -259,7 +259,7 @@ def test_eq(self): assert Undate(2022, 10) == Undate(2022, 10) assert Undate(2022, 10, 1) == Undate(2022, 10, 1) # dates without a known year cannot known to be equal - assert not Undate(month=2, day=7) == Undate(month=2, day=7) + assert Undate(month=2, day=7) != Undate(month=2, day=7) # something we can't convert for comparison should return NotImplemented assert Undate(2022).__eq__("not a date") == NotImplemented