Skip to content

Commit ec5b1f6

Browse files
Restored local file system support
1 parent 3d8f890 commit ec5b1f6

File tree

4 files changed

+24
-25
lines changed

4 files changed

+24
-25
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ These are the main requirements for the SILNLP code to run on a local machine. S
7171
MINIO_ACCESS_KEY=xxxxxxxxx
7272
MINIO_SECRET_KEY=xxxxxxx
7373
```
74+
* Include SIL_NLP_DATA_PATH="/silnlp" if you are not using B2 or MinIO and will be storing files locally.
7475
* If you do not intend to use SILNLP with ClearML and/or B2/MinIO, you can leave out the respective variables. If you need to generate ClearML credentials, see [ClearML setup](clear_ml_setup.md).
7576
* Note that this does not give you direct access to a B2 or MinIO bucket from within the Docker container; it only allows you to run scripts referencing files in the bucket.
7677

@@ -143,6 +144,7 @@ These are the main requirements for the SILNLP code to run on a local machine. S
143144
MINIO_ACCESS_KEY=xxxxxxxxx
144145
MINIO_SECRET_KEY=xxxxxxx
145146
```
147+
* Include SIL_NLP_DATA_PATH="/silnlp" if you are not using B2 or MinIO and will be storing files locally.
146148
* If you need to generate ClearML credentials, see [ClearML setup](clear_ml_setup.md).
147149
* Note that this does not give you direct access to a B2 or MinIO bucket from within the Docker container; it only allows you to run scripts referencing files in the bucket.
148150
* For instructions on how to permanently set up environment variables for your operating system, see the corresponding section under the Development Environment Setup header below.

bucket_setup.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ We use Backblaze B2 and MinIO storage for storing our experiment data. Here is s
44

55
### Note For MinIO setup
66

7-
In order to access the MinIO bucket locally, you must have a VPN connected to its network.
7+
In order to access the MinIO bucket locally, you must have a VPN connected to its network. If you need VPN access, please reach out to an SILNLP dev team member.
88

99
### Install and configure rclone
1010

manual_setup.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ See [ClearML setup](clear_ml_setup.md).
8989

9090
### Additional Environment Variables
9191
* Set the following environment variables with your respective credentials: CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY, B2_KEY_ID, B2_APPLICATION_KEY, MINIO_ACCESS_KEY, MINIO_SECRET_KEY.
92+
* Set SIL_NLP_DATA_PATH to "/silnlp" if you are not using B2 or MinIO and will be storing files locally.
9293
* Set CLEARML_API_HOST to "https://api.sil.hosted.allegro.ai".
9394
* Set B2_ENDPOINT_URL to https://s3.us-east-005.backblazeb2.com
9495
* Set MINIO_ENDPOINT_URL to https://truenas.psonet.languagetechnology.org:9000

silnlp/common/environment.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,11 @@ def __init__(self):
3030
self.is_bucket = False
3131
self.bucket_service = os.getenv("BUCKET_SERVICE", "").lower()
3232

33-
self.set_s3_bucket()
33+
self.set_data_dir()
3434

3535
def set_data_dir(self, data_dir: Optional[Path] = None):
36-
data_dir = self.resolve_data_dir(data_dir)
36+
if data_dir is None:
37+
data_dir = self.resolve_data_dir()
3738

3839
self.data_dir = pathify(data_dir)
3940

@@ -127,31 +128,28 @@ def set_alignment_dir(self, align_dir: Optional[Path] = None):
127128
self.align_gold_dir = self.align_dir / "gold"
128129
self.align_experiments_dir = self.align_dir / "experiments"
129130

130-
def resolve_data_dir(self, data_path) -> Path:
131+
def resolve_data_dir(self) -> Path:
131132
self.is_bucket = False
132-
if data_path != "":
133-
temp_path = Path(data_path)
133+
sil_nlp_data_path = os.getenv("SIL_NLP_DATA_PATH", default="")
134+
if sil_nlp_data_path != "" and self.bucket_service == "":
135+
temp_path = Path(sil_nlp_data_path)
134136
if temp_path.is_dir():
135-
LOGGER.info(f"Using workspace: {data_path} as per environment variable data_path.")
136-
return Path(data_path)
137+
LOGGER.info(f"Using workspace: {sil_nlp_data_path} as per environment variable SIL_NLP_DATA_PATH.")
138+
return Path(sil_nlp_data_path)
137139
else:
138-
temp_s3_path = S3Path(data_path)
139-
if temp_s3_path.is_dir():
140-
LOGGER.info(f"Using s3 workspace: {data_path}.")
141-
self.is_bucket = True
142-
return S3Path(data_path)
143-
else:
144-
raise Exception(
145-
f"The path defined by environment variable data_path ({data_path}) is not a "
146-
+ "real or s3 directory."
147-
)
140+
raise Exception(
141+
f"The path defined by environment variable SIL_NLP_DATA_PATH ({sil_nlp_data_path}) is not a "
142+
+ "real directory."
143+
)
148144

149145
gutenberg_path = Path("G:/Shared drives/Gutenberg")
150146
if gutenberg_path.is_dir():
151147
LOGGER.info(f"Using workspace: {gutenberg_path}.")
152148
return gutenberg_path
153149

154-
s3root = S3Path(data_path)
150+
self.set_s3_bucket()
151+
sil_nlp_data_path = f"/{self.bucket.name}"
152+
s3root = S3Path(sil_nlp_data_path)
155153
if s3root.is_dir():
156154
LOGGER.info(f"Using s3 workspace: {s3root}.")
157155
self.is_bucket = True
@@ -174,7 +172,6 @@ def set_resource(self, bucket_name: str, endpoint_url: str, access_key: str, sec
174172
# Tests the connection to the bucket. Delete is used because it fails fast and is free of api cost from Backblaze.
175173
bucket.delete_objects(Delete={"Objects": [{"Key": "conn_test_key"}]})
176174
register_configuration_parameter(PureS3Path("/"), resource=resource)
177-
self.set_data_dir(S3Path(f"/{bucket_name}"))
178175
self.bucket = bucket
179176

180177
def set_s3_bucket(self):
@@ -187,7 +184,6 @@ def set_s3_bucket(self):
187184
)
188185
bucket = resource.Bucket("silnlp")
189186
register_configuration_parameter(PureS3Path("/"), resource=resource)
190-
self.set_data_dir(S3Path(f"/silnlp"))
191187
self.bucket = bucket
192188
self.bucket_service = "aws"
193189
return
@@ -209,8 +205,8 @@ def set_s3_bucket(self):
209205
LOGGER.info("Connected to MINIO bucket.")
210206
self.bucket_service = "minio"
211207
except Exception as e:
212-
LOGGER.info(e)
213-
LOGGER.info("MINIO connection failed.")
208+
LOGGER.warning(e)
209+
LOGGER.warning("MINIO connection failed.")
214210
if self.bucket_service in ["", "b2"]:
215211
try:
216212
LOGGER.info("Trying to connect to B2 bucket.")
@@ -223,8 +219,8 @@ def set_s3_bucket(self):
223219
LOGGER.info("Connected to B2 bucket.")
224220
self.bucket_service = "b2"
225221
except Exception as e:
226-
LOGGER.info(e)
227-
LOGGER.info("B2 connection failed.")
222+
LOGGER.warning(e)
223+
LOGGER.warning("B2 connection failed.")
228224

229225
def copy_pt_project_from_bucket(self, name: Union[str, Path], patterns: Union[str, Sequence[str]] = []):
230226
if not self.is_bucket:

0 commit comments

Comments
 (0)