Skip to content

Commit 6ca3847

Browse files
authored
Merge pull request #3 from arangoml/jsonl_support
adds jsonl support
2 parents 500b248 + 39bfe22 commit 6ca3847

File tree

3 files changed

+188
-35
lines changed

3 files changed

+188
-35
lines changed

arango_datasets/datasets.py

Lines changed: 62 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
import json
12
import sys
2-
from typing import Any, Dict, List, Optional
3+
from typing import Any, Dict, List
34

45
import requests
56
from arango.collection import StandardCollection
@@ -25,13 +26,14 @@ class Datasets:
2526
def __init__(
2627
self,
2728
db: Database,
28-
batch_size: Optional[int] = None,
29+
batch_size: int = 50,
2930
metadata_file: str = "https://arangodb-dataset-library.s3.amazonaws.com/root_metadata.json", # noqa: E501
3031
):
3132
self.metadata_file: str = metadata_file
3233
self.metadata_contents: Dict[str, Any]
3334
self.batch_size = batch_size
3435
self.user_db = db
36+
self.file_type: str
3537
if issubclass(type(db), Database) is False:
3638
msg = "**db** parameter must inherit from arango.database.Database"
3739
raise TypeError(msg)
@@ -74,36 +76,80 @@ def insert_docs(
7476
except DocumentInsertError as exec:
7577
print("Document insertion failed due to the following error:")
7678
print(exec.message)
79+
sys.exit(1)
7780

7881
print(f"Finished loading current file for collection: {collection_name}")
7982

80-
def load_file(self, collection_name: str, edge_type: bool, file_url: str) -> None:
81-
collection: StandardCollection
82-
83+
def load_json(
    self,
    collection_name: str,
    edge_type: bool,
    file_url: str,
    collection: StandardCollection,
) -> None:
    """Download a JSON file and import its documents into *collection*.

    :param collection_name: Name of the target collection (used in messages).
    :param edge_type: True for an edge collection. Unused here; kept for
        signature parity with ``load_jsonl`` and ``load_file``.
    :param file_url: URL of the JSON file to download.
    :param collection: Already-created collection receiving the documents.
    :raises HTTPError: If the server responds with an error status.
    :raises ConnectionError: If the download fails.
    """
    try:
        with progress(f"Downloading file for: {collection_name}") as p:
            p.add_task("load_file")
            response = requests.get(file_url)
            # Fail fast on HTTP error statuses (e.g. 404) instead of
            # handing an error page to the JSON parser below.
            response.raise_for_status()
            data = response.json()
    except (HTTPError, ConnectionError) as e:
        print("Unable to download file.")
        print(e)
        # Bare `raise` (was `raise e`) re-raises with the original
        # traceback and matches load_jsonl's style.
        raise
    print(f"Downloaded file for: {collection_name}, now importing... ")
    self.insert_docs(collection, data, collection_name)
92100

101+
def load_jsonl(
    self,
    collection_name: str,
    edge_type: bool,
    file_url: str,
    collection: StandardCollection,
) -> None:
    """Download a JSONL file (one JSON document per line) and import it.

    :param collection_name: Name of the target collection (used in messages).
    :param edge_type: True for an edge collection. Unused here; kept for
        signature parity with ``load_json`` and ``load_file``.
    :param file_url: URL of the JSONL file to download.
    :param collection: Already-created collection receiving the documents.
    :raises HTTPError: If the server responds with an error status.
    :raises ConnectionError: If the download fails.
    """
    json_data: List[Dict[str, Any]] = []
    try:
        with progress(f"Downloading file for: {collection_name}") as p:
            p.add_task("load_file")
            response = requests.get(file_url)
            # Fail fast on HTTP error statuses instead of trying to parse
            # an error page line by line.
            response.raise_for_status()

            # iter_lines needs a text encoding for decode_unicode; fall
            # back to UTF-8 when the server does not declare one.
            if response.encoding is None:
                response.encoding = "utf-8"

            # One JSON document per non-empty line.
            for line in response.iter_lines(decode_unicode=True):
                if line:
                    json_data.append(json.loads(line))

    except (HTTPError, ConnectionError) as e:
        print("Unable to download file.")
        print(e)
        raise
    print(f"Downloaded file for: {collection_name}, now importing... ")
    self.insert_docs(collection, json_data, collection_name)
103127

104-
def load(self, dataset_name: str) -> None:
128+
def load_file(self, collection_name: str, edge_type: bool, file_url: str) -> None:
    """Create *collection_name* and populate it from *file_url*.

    Dispatches to ``load_json`` or ``load_jsonl`` according to
    ``self.file_type``, which ``load()`` sets from the dataset metadata.

    :param collection_name: Name of the collection to create.
    :param edge_type: True to create an edge collection.
    :param file_url: URL of the data file to import.
    :raises ValueError: If ``self.file_type`` is neither "json" nor "jsonl".

    Exits the process (``sys.exit(1)``) when the collection cannot be
    created, matching the error handling in ``insert_docs``.
    """
    collection: StandardCollection
    try:
        collection = self.user_db.create_collection(
            collection_name, edge=edge_type
        )  # type: ignore
    except CollectionCreateError as err:  # was `exec`: don't shadow the builtin
        print(
            f"""Failed to create {collection_name} collection due
            to the following error:"""
        )
        print(err.error_message)
        sys.exit(1)

    if self.file_type == "json":
        self.load_json(collection_name, edge_type, file_url, collection)
    elif self.file_type == "jsonl":
        self.load_jsonl(collection_name, edge_type, file_url, collection)
    else:
        raise ValueError(f"Unsupported file type: {self.file_type}")
105147

148+
def load(self, dataset_name: str) -> None:
106149
if str(dataset_name).upper() in self.labels:
150+
self.file_type = self.metadata_contents[str(dataset_name).upper()][
151+
"file_type"
152+
]
107153

108154
for edge in self.metadata_contents[str(dataset_name).upper()]["edges"]:
109155
for e in edge["files"]:

tests/conftest.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ def pytest_configure(config: Any) -> None:
3838

3939
def cleanup_collections() -> None:
    """Drop the test collections if present, logging (not raising) on failure."""
    global db
    for name in ("test_vertex", "test_edge"):
        if db.has_collection(name):
            try:
                db.delete_collection(name)
            except CollectionDeleteError:
                print(f"unable to delete {name}")

tests/test_main.py

Lines changed: 118 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,25 @@
11
from typing import Any, no_type_check
22

33
import pytest
4+
from requests import ConnectionError
45

56
from arango_datasets.datasets import Datasets
67

7-
from .conftest import db
8+
from .conftest import cleanup_collections, db
89

10+
# Metadata locations shared by the tests below. (The previous `global`
# statements were dropped: `global` at module scope is a no-op.)
test_metadata_url = (
    "https://arangodb-dataset-library.s3.amazonaws.com/test_metadata.json"  # noqa: E501
)
root_metadata_url = (
    "https://arangodb-dataset-library.s3.amazonaws.com/root_metadata.json"  # noqa: E501
)
bad_metadata_url = "http://bad_url.arangodb.com/"
920

21+
22+
@no_type_check
1023
def test_dataset_constructor() -> None:
1124
assert Datasets(db)
1225
assert Datasets(db, batch_size=1000)
@@ -17,21 +30,31 @@ def test_dataset_constructor() -> None:
1730
assert Datasets(
1831
db,
1932
batch_size=1000,
20-
metadata_file="https://arangodb-dataset-library.s3.amazonaws.com/root_metadata.json", # noqa: E501
33+
metadata_file=root_metadata_url,
2134
)
35+
with pytest.raises(TypeError):
36+
assert Datasets(
37+
db="some none db object",
38+
batch_size=1000,
39+
metadata_file=root_metadata_url,
40+
)
2241
with pytest.raises(Exception):
2342
assert Datasets({})
2443

25-
with pytest.raises(Exception):
26-
assert Datasets(db, metadata_file="bad_url")
44+
with pytest.raises(ConnectionError):
45+
assert Datasets(db, metadata_file=bad_metadata_url)
2746

2847

48+
@no_type_check
def test_list_datasets(capfd: Any) -> None:
    """list_datasets() prints the dataset labels and returns them as a list."""
    labels = Datasets(db, metadata_file=test_metadata_url).list_datasets()
    printed, _ = capfd.readouterr()
    assert "TEST" in printed
    assert type(labels) is list
    assert "TEST" in labels
3558

3659

3760
@no_type_check
@@ -42,17 +65,101 @@ def test_dataset_info(capfd: Any) -> None:
4265
with pytest.raises(Exception):
4366
Datasets(db).dataset_info(2)
4467

45-
dataset = Datasets(db).dataset_info("FLIGHTS")
68+
dataset = Datasets(
69+
db,
70+
metadata_file=test_metadata_url,
71+
).dataset_info("TEST")
4672
assert type(dataset) is dict
4773

74+
assert dataset["TEST"]["file_type"] == "json"
75+
4876
out, err = capfd.readouterr()
4977
assert len(out) > 0
5078

5179

80+
@no_type_check
def test_load_file() -> None:
    # NOTE(review): load_file is invoked on the class without an instance,
    # so this raises TypeError (missing `self`) before load_file's own
    # logic runs — the test passes, but it does not actually exercise the
    # bad-URL path it appears to target. Confirm intent with the authors.
    with pytest.raises(Exception):
        Datasets.load_file(collection_name="test", edge_type=None, file_url="false")
84+
85+
86+
@no_type_check
def test_load_json() -> None:
    """load_json imports the test vertex JSON file without raising."""
    cleanup_collections()
    collection_name = "test_vertex"
    edge_type = False
    file_url = "https://arangodb-dataset-library.s3.amazonaws.com/test_files/json/vertex_collection/test_vertex.json"  # noqa: E501
    collection = db.create_collection(collection_name)
    # Call through an instance (the original used an awkward unbound call)
    # and pin the None return with `is`, not `None == ...`.
    result = Datasets(db).load_json(
        collection_name=collection_name,
        edge_type=edge_type,
        file_url=file_url,
        collection=collection,
    )
    assert result is None
102+
103+
104+
@no_type_check
def test_json_bad_url() -> None:
    """load_json surfaces a ConnectionError for an unreachable URL.

    Renamed from ``json_bad_url``: without the ``test_`` prefix pytest
    never collected or ran this test.
    """
    cleanup_collections()
    collection_name = "test_vertex"
    edge_type = False
    collection = db.create_collection(collection_name)

    with pytest.raises(ConnectionError):
        Datasets(db).load_json(
            collection_name=collection_name,
            edge_type=edge_type,
            file_url=bad_metadata_url,
            collection=collection,
        )
119+
120+
121+
@no_type_check
def test_load_jsonl() -> None:
    """load_jsonl imports the test vertex JSONL file without raising."""
    cleanup_collections()
    collection_name = "test_vertex"
    edge_type = False
    file_url = "https://arangodb-dataset-library.s3.amazonaws.com/test_files/jsonl/vertex_collection/test_vertex.jsonl"  # noqa: E501
    collection = db.create_collection(collection_name)
    # Call through an instance (the original used an awkward unbound call)
    # and pin the None return with `is`, not `None == ...`.
    result = Datasets(db).load_jsonl(
        collection_name=collection_name,
        edge_type=edge_type,
        file_url=file_url,
        collection=collection,
    )
    assert result is None
137+
138+
139+
@no_type_check
def test_jsonl_bad_url() -> None:
    """load_jsonl surfaces a ConnectionError for an unreachable URL.

    Renamed from ``jsonl_bad_url``: without the ``test_`` prefix pytest
    never collected or ran this test.
    """
    cleanup_collections()
    collection_name = "test_vertex"
    edge_type = False
    collection = db.create_collection(collection_name)
    with pytest.raises(ConnectionError):
        Datasets(db).load_jsonl(
            collection_name=collection_name,
            edge_type=edge_type,
            file_url=bad_metadata_url,
            collection=collection,
        )
153+
154+
52155
@no_type_check
def test_load() -> None:
    """End-to-end: load the TEST dataset and verify the document counts."""
    cleanup_collections()
    loader = Datasets(
        db,
        metadata_file=test_metadata_url,
    )
    loader.load("TEST")
    # A non-string dataset name must be rejected.
    with pytest.raises(Exception):
        Datasets(db).load(2)
    assert db.collection("test_vertex").count() == 2
    assert db.collection("test_edge").count() == 1

0 commit comments

Comments (0)