def smart_split(text: str, target_length: int = 400) -> str:
    """
    Splits text into paragraphs based on sentence length and structure.

    The text is split into sentences wherever sentence-ending punctuation
    (., ! or ?) is followed by whitespace and a capital letter. Sentences are
    accumulated into the current paragraph, and a new paragraph is started
    once the accumulated sentence length exceeds target_length, unless the
    next sentence begins with a conjunction ("and", "but", "or", "so",
    "because"). Sentences within a paragraph are joined by a single space and
    paragraphs are separated by a blank line.

    Args:
        text: The input text.
        target_length: Approximate characters per paragraph. Defaults to 400.

    Returns:
        The text with paragraphs separated by blank lines. Empty or
        whitespace-only input gives an empty string.
    """
    text = text.strip()
    if not text:
        # Nothing to split; without this guard the first-word lookup below
        # would index into an empty list.
        return ""

    # 1. Split into sentences
    # Look for (.!?) followed by whitespace and a capital letter
    sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z])", text)
    weak_starters = ("and", "but", "or", "so", "because")

    paragraphs = []
    current_para = []
    # Sum of the sentence lengths in current_para (joining spaces not counted)
    current_length = 0

    for sentence in sentences:
        sentence = sentence.strip()
        if not sentence:
            continue

        # 2. Analyze the sentence start
        first_word = sentence.split()[0].replace(",", "")

        # Score: How likely is this a new paragraph starter?
        score = 0

        # Rule A: Proper Nouns/Acronyms often start new topics
        # (e.g., "The WDPCA", "UNEP-WCMC", "Data")
        # Slicing rather than indexing: first_word is empty when the first
        # token consisted only of commas.
        if first_word[:1].isupper() and len(first_word) > 1:
            score += 1

        # Rule B: Weak Starters (Conjunctions) -> Force stay
        if first_word.lower() in weak_starters:
            score -= 5

        # Rule C: Length Pressure
        # If we are over the target length, force a split on any decent sentence
        if current_length > target_length:
            score += 3
        elif current_length < (target_length / 2):
            score -= 5  # Too short to split yet

        # 3. Decision Time
        # If score is high enough (and we aren't at the very first sentence)
        if score >= 2 and current_para:
            paragraphs.append(" ".join(current_para))
            current_para = [sentence]
            current_length = len(sentence)
        else:
            current_para.append(sentence)
            current_length += len(sentence)

    # Append whatever is left
    if current_para:
        paragraphs.append(" ".join(current_para))

    return "\n\n".join(paragraphs)
The WDPCA is the most comprehensive global database of marine and terrestrial protected areas and other effective area-based conservation measures, updated on a monthly basis, and is one of the key global biodiversity datasets being widely used by scientists, businesses, governments, international secretariats, and others to inform planning, policy decisions, and management. The WDPCA is part of the Protected Planet Initiative, a joint product of the UN Environment Programme and the International Union for Conservation of Nature (IUCN). The compilation and management of the WDPCA is carried out by the UN Environment Programme World Conservation Monitoring Centre (UNEP-WCMC), in collaboration with governments and other stakeholders. Data and information on the world's protected and conserved areas compiled in the WDPCA is used for reporting on progress towards reaching Target 3 of the Kunming-Montreal Global Biodiversity Framework, which calls for 30% of the world’s land and waters to be effectively conserved by 2030. Additionally, the WDPCA is used for reporting to the UN to track progress towards the 2030 Sustainable Development Goals, tracking of core indicators of the Intergovernmental Science-Policy Platform on Biodiversity and Ecosystem Services (IPBES), and providing information for other international assessments and reports including the Global Biodiversity Outlook. UNEP-WCMC and IUCN periodically release the Protected Planet Report on the status of the world's protected and conserved areas. Many platforms are incorporating the WDPCA to provide integrated information to diverse users, including businesses and governments, in a range of sectors. For example, the WDPCA is included in the Integrated Biodiversity Assessment Tool (IBAT), an innovative decision support tool that gives commercial users easy access to up-to-date information that allows them to identify biodiversity risks and opportunities within a project boundary. 
The reach of the WDPCA is further enhanced by the UN Biodiversity Lab as well as services developed by other parties, such as the Global Forest Watch and the Digital Observatory for Protected Areas, which provide decision makers with access to monitoring and alert systems that allow whole landscapes to be managed better. Together, these applications of the WDPCA demonstrate the growing value and significance of the Protected Planet initiative. diff --git a/tests/fixtures/text/text2.txt b/tests/fixtures/text/text2.txt new file mode 100644 index 00000000..ddf80047 --- /dev/null +++ b/tests/fixtures/text/text2.txt @@ -0,0 +1 @@ +UNHCR, the UN Refugee Agency, is a global organization dedicated to saving lives, protecting rights and building a better future for people forced to flee their homes because of conflict and persecution. We lead international action to protect refugees, forcibly displaced communities and stateless people. Our vision is a world where every person forced to flee can build a better future. Formally known as the Office of the High Commissioner for Refugees, UNHCR was established by the General Assembly of the United Nations in 1950 in the aftermath of the Second World War to help the millions of people who had lost their homes. Today, UNHCR works in 128 countries. We provide life-saving assistance, including shelter, food, water and medical care for people forced to flee conflict and persecution, many of whom have nobody left to turn to. We defend their right to reach safety and help them find a place to call home so they can rebuild their lives. Long term, we work with countries to improve and monitor refugee and asylum laws and policies, ensuring human rights are upheld. In everything we do UNHCR considers refugees and those forced to flee as partners, putting those most affected at the centre of planning and decision-making. 
@pytest.fixture(scope="session", autouse=True)
def useragent():
    """Set the global user agent to "test" for the whole test session.

    Declared with session scope and autouse, so tests do not have to request
    it explicitly. The statement after ``yield`` is the teardown step and
    clears the global user agent again.
    """
    UserAgent.set_global("test")
    yield
    UserAgent.clear_global()
@pytest.fixture(scope="function")
def text1(self, fixturesfolder):
    """Contents of the ``text/text1.txt`` fixture, read with ``Download``."""
    source = fixturesfolder / "text" / "text1.txt"
    with Download() as reader:
        return reader.download_text(source)

@pytest.fixture(scope="function")
def text2(self, fixturesfolder):
    """Contents of the ``text/text2.txt`` fixture, read with ``Download``."""
    source = fixturesfolder / "text" / "text2.txt"
    with Download() as reader:
        return reader.download_text(source)

def test_smart_split(self, text1, text2):
    """Check the paragraphs produced by smart_split for both fixture texts."""
    # Expected output for text1.txt: five paragraphs separated by blank lines
    expected_text1 = (
        "The World Database on Protected and Conserved Areas (WDPCA) combines the "
        "formerly separate World Database on Protected Areas (WDPA) and World "
        "Database on Other Effective Area-based Conservation Measures (WD-OECM). The "
        "WDPCA is the most comprehensive global database of marine and terrestrial "
        "protected areas and other effective area-based conservation measures, "
        "updated on a monthly basis, and is one of the key global biodiversity "
        "datasets being widely used by scientists, businesses, governments, "
        "international secretariats, and others to inform planning, policy decisions, "
        "and management.\n"
        "\n"
        "The WDPCA is part of the Protected Planet Initiative, a joint product of the "
        "UN Environment Programme and the International Union for Conservation of "
        "Nature (IUCN). The compilation and management of the WDPCA is carried out by "
        "the UN Environment Programme World Conservation Monitoring Centre "
        "(UNEP-WCMC), in collaboration with governments and other stakeholders. Data "
        "and information on the world's protected and conserved areas compiled in the "
        "WDPCA is used for reporting on progress towards reaching Target 3 of the "
        "Kunming-Montreal Global Biodiversity Framework, which calls for 30% of the "
        "world’s land and waters to be effectively conserved by 2030.\n"
        "\n"
        "Additionally, the WDPCA is used for reporting to the UN to track progress "
        "towards the 2030 Sustainable Development Goals, tracking of core indicators "
        "of the Intergovernmental Science-Policy Platform on Biodiversity and "
        "Ecosystem Services (IPBES), and providing information for other "
        "international assessments and reports including the Global Biodiversity "
        "Outlook. UNEP-WCMC and IUCN periodically release the Protected Planet Report "
        "on the status of the world's protected and conserved areas.\n"
        "\n"
        "Many platforms are incorporating the WDPCA to provide integrated information "
        "to diverse users, including businesses and governments, in a range of "
        "sectors. For example, the WDPCA is included in the Integrated Biodiversity "
        "Assessment Tool (IBAT), an innovative decision support tool that gives "
        "commercial users easy access to up-to-date information that allows them to "
        "identify biodiversity risks and opportunities within a project boundary.\n"
        "\n"
        "The reach of the WDPCA is further enhanced by the UN Biodiversity Lab as "
        "well as services developed by other parties, such as the Global Forest Watch "
        "and the Digital Observatory for Protected Areas, which provide decision "
        "makers with access to monitoring and alert systems that allow whole "
        "landscapes to be managed better. Together, these applications of the WDPCA "
        "demonstrate the growing value and significance of the Protected Planet "
        "initiative."
    )
    # Expected output for text2.txt: three paragraphs separated by blank lines
    expected_text2 = (
        "UNHCR, the UN Refugee Agency, is a global organization dedicated to saving "
        "lives, protecting rights and building a better future for people forced to "
        "flee their homes because of conflict and persecution. We lead international "
        "action to protect refugees, forcibly displaced communities and stateless "
        "people. Our vision is a world where every person forced to flee can build a "
        "better future. Formally known as the Office of the High Commissioner for "
        "Refugees, UNHCR was established by the General Assembly of the United "
        "Nations in 1950 in the aftermath of the Second World War to help the "
        "millions of people who had lost their homes.\n"
        "\n"
        "Today, UNHCR works in 128 countries. We provide life-saving assistance, "
        "including shelter, food, water and medical care for people forced to flee "
        "conflict and persecution, many of whom have nobody left to turn to. We "
        "defend their right to reach safety and help them find a place to call home "
        "so they can rebuild their lives. Long term, we work with countries to "
        "improve and monitor refugee and asylum laws and policies, ensuring human "
        "rights are upheld.\n"
        "\n"
        "In everything we do UNHCR considers refugees and those forced to flee as "
        "partners, putting those most affected at the centre of planning and "
        "decision-making."
    )

    assert smart_split(text1) == expected_text1
    assert smart_split(text2) == expected_text2