-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwikitext_cache.py
More file actions
28 lines (23 loc) · 1.05 KB
/
wikitext_cache.py
File metadata and controls
28 lines (23 loc) · 1.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Local WikiText cache loader used by the ABI proof scripts."""
import pathlib
from datasets import Dataset, load_dataset
def load_wikitext_split(config: str = "wikitext-2-raw-v1", split: str = "train"):
    """
    Return one WikiText split, preferring an Arrow file already cached locally.

    The default Hugging Face datasets cache under the user's home directory
    is searched recursively for ``wikitext-<split>.arrow`` inside the folder
    of the requested ``config``. If several files match, the one with the
    newest modification time is opened directly with ``Dataset.from_file()``.
    If none match, the call falls through to the regular ``load_dataset()``.

    Rationale: on this Windows setup, load_dataset(..., offline) can spend
    minutes resolving a cached builder. Reading the already-present Arrow
    split directly avoids that and keeps proof reruns reproducible without
    changing corpus content.

    Args:
        config: WikiText configuration name; also the cache sub-folder name.
        split: Split name used to build the Arrow file name.

    Returns:
        The dataset read from the cached Arrow file when one is found,
        otherwise whatever ``load_dataset("wikitext", config, split=split)``
        returns.
    """
    datasets_dir = pathlib.Path.home() / ".cache" / "huggingface" / "datasets"
    config_dir = datasets_dir / "wikitext" / config

    # Collect every match first, then pick the most recently modified one.
    arrow_files = list(config_dir.glob(f"**/wikitext-{split}.arrow"))
    newest = max(
        arrow_files,
        key=lambda arrow_path: arrow_path.stat().st_mtime,
        default=None,
    )

    if newest is None:
        # No cached Arrow split for this config/split: use the normal loader.
        return load_dataset("wikitext", config, split=split)

    print(f" [data] using cached {config}/{split}: {newest}", flush=True)
    return Dataset.from_file(str(newest))