Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/harmony/parsing/text_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def convert_text_to_instruments(file: RawFile) -> List[Instrument]:
elif "," in first_line:
csv_sep = ","

if file.file_type == FileType.csv and csv_sep is not None:
string_io = StringIO(page_text)
df = pd.read_csv(string_io, sep=csv_sep)
df.fillna("", inplace=True)
Expand Down
17 changes: 17 additions & 0 deletions tests/test_convert_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,23 @@ def test_remove_both_ends_digits_from_csv(self):
self.assertEqual("How are you today", questions[0].question_text)
self.assertEqual("Are you feeling better", questions[1].question_text)

def test_csv_without_detected_separator_falls_back_to_line_parsing(self):
    # Regression guard. Per the original note, pandas >=2.3 dropped support
    # for pd.read_csv(sep=None), so a ".csv" upload whose first line holds
    # neither a tab nor a comma has to be handled by the plain-text branch
    # (one question per line) and must never reach pd.read_csv(sep=None).

    # Arrange: a file typed as CSV whose content has no separator at all.
    payload = {
        "file_id": "no_sep_csv",
        "file_name": "no_separator.csv",
        "file_type": "csv",
        "content": "I feel anxious\nI feel restless",
    }
    raw_file = RawFile.model_validate(payload)

    # Act
    parsed = convert_text_to_instruments(raw_file)

    # Assert: exactly one instrument, holding one question per input line.
    self.assertEqual(1, len(parsed))
    parsed_questions = parsed[0].questions
    self.assertEqual(2, len(parsed_questions))
    first_question, second_question = parsed_questions[0], parsed_questions[1]
    self.assertEqual("I feel anxious", first_question.question_text)
    self.assertEqual("I feel restless", second_question.question_text)


# Run the suite only when this module is executed directly as a script;
# importing it (e.g. via a test collector) must not trigger a run.
if __name__ == '__main__':
    unittest.main()
Loading