-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathcommon_utils.py
More file actions
55 lines (50 loc) · 2.43 KB
/
common_utils.py
File metadata and controls
55 lines (50 loc) · 2.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import re
from typing import Any, Dict, List
import requests
def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """
    Read a JSONL source (one JSON value per line) and return the parsed values.

    Both local file paths and ``http://`` / ``https://`` URLs are supported.
    Blank and whitespace-only lines are skipped; line numbers reported in
    errors count every line, including the skipped ones.

    Args:
        file_path: Path to a local JSONL file, or an HTTP(S) URL serving one.

    Returns:
        A list with one parsed JSON value per non-blank line, in input order.

    Raises:
        ValueError: A line is not valid JSON and contains the text ``row_id``;
            the message carries the line number, the offending line and the
            extracted row id (``None`` if it could not be extracted).
        json.JSONDecodeError: A line is not valid JSON and has no ``row_id``.
        OSError: The local file cannot be opened (e.g. it does not exist).
        requests.HTTPError: The URL responded with an error status.
    """
    if file_path.startswith(("http://", "https://")):
        # The context manager releases the streamed connection even when
        # parsing fails part-way through the body.
        with requests.get(file_path, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            return _parse_jsonl_lines(resp.iter_lines(decode_unicode=True), "URL", file_path)

    with open(file_path, "r", encoding="utf-8") as f:
        return _parse_jsonl_lines(f, "file", file_path)


# Captures the value that follows a "row_id" key: either a quoted JSON string
# (escapes allowed) or a bare token running up to the next comma/brace/space.
_ROW_ID_PATTERN = re.compile(r'"row_id"\s*:\s*("(?:[^"\\]|\\.)*"|[^,}\s]+)')


def _parse_jsonl_lines(lines: Any, source_kind: str, source: str) -> List[Dict[str, Any]]:
    """Parse an iterable of text lines as JSONL, reporting errors against *source*."""
    records: List[Dict[str, Any]] = []
    for line_number, raw in enumerate(lines, start=1):
        if raw is None:
            continue
        if isinstance(raw, bytes):
            # requests yields undecoded bytes when the response declares no
            # encoding; decode so the string handling below works uniformly.
            raw = raw.decode("utf-8")
        stripped = raw.strip()
        if not stripped:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            continue
        try:
            records.append(json.loads(stripped))
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON line for {source_kind} {source} at line {line_number}")
            if "row_id" in stripped:
                # Best effort: surface the row id so the bad record can be
                # located even though the line as a whole does not parse.
                match = _ROW_ID_PATTERN.search(stripped)
                row_id = match.group(1) if match else None
                raise ValueError(f"{e.msg} at line {line_number}: {stripped} ({row_id})") from e
            raise
    return records