-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathprocess.py
More file actions
24 lines (22 loc) · 908 Bytes
/
Copy pathprocess.py
File metadata and controls
24 lines (22 loc) · 908 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import json
import sys
def process_file(input_file, output_file, max_words=10000):
line_count = 0
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
for line in infile:
line_count += 1
data = json.loads(line)
if 'meta' in data:
del data['meta']
text = data['text'].split(' ')
# split overly long samples, otherwise tokenization might get stuck
while len(text) > max_words:
data['text'] = ' '.join(text[:max_words])
outfile.write(json.dumps(data) + '\n')
text = text[max_words:]
data['text'] = ' '.join(text)
outfile.write(json.dumps(data) + '\n')
i = int(sys.argv[1])
input_file = f"{i:02d}.jsonl"
output_file = f"{i:02d}.text.jsonl"
process_file(input_file, output_file)