Skip to content

Commit

Permalink
Restored local file system support
Browse files Browse the repository at this point in the history
  • Loading branch information
TaperChipmunk32 committed Feb 11, 2025
1 parent 66b094b commit 833e27b
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 25 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ These are the main requirements for the SILNLP code to run on a local machine. S
MINIO_ACCESS_KEY=xxxxxxxxx
MINIO_SECRET_KEY=xxxxxxx
```
* Include `SIL_NLP_DATA_PATH="/silnlp"` if you are not using B2 or MinIO and will be storing files locally.
* If you do not intend to use SILNLP with ClearML and/or B2/MinIO, you can leave out the respective variables. If you need to generate ClearML credentials, see [ClearML setup](clear_ml_setup.md).
* Note that this does not give you direct access to a B2 or MinIO bucket from within the Docker container; it only allows you to run scripts referencing files in the bucket.

Expand Down Expand Up @@ -143,6 +144,7 @@ These are the main requirements for the SILNLP code to run on a local machine. S
MINIO_ACCESS_KEY=xxxxxxxxx
MINIO_SECRET_KEY=xxxxxxx
```
* Include `SIL_NLP_DATA_PATH="/silnlp"` if you are not using B2 or MinIO and will be storing files locally.
* If you need to generate ClearML credentials, see [ClearML setup](clear_ml_setup.md).
* Note that this does not give you direct access to a B2 or MinIO bucket from within the Docker container; it only allows you to run scripts referencing files in the bucket.
* For instructions on how to permanently set up environment variables for your operating system, see the corresponding section under the Development Environment Setup header below.
Expand Down
2 changes: 1 addition & 1 deletion bucket_setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ We use Backblaze B2 and MinIO storage for storing our experiment data. Here is s

### Note For MinIO setup

In order to access the MinIO bucket locally, you must have a VPN connected to its network.
In order to access the MinIO bucket locally, you must have a VPN connected to its network. If you need VPN access, please reach out to an SILNLP dev team member.

### Install and configure rclone

Expand Down
1 change: 1 addition & 0 deletions manual_setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ See [ClearML setup](clear_ml_setup.md).

### Additional Environment Variables
* Set the following environment variables with your respective credentials: CLEARML_API_ACCESS_KEY, CLEARML_API_SECRET_KEY, B2_KEY_ID, B2_APPLICATION_KEY, MINIO_ACCESS_KEY, MINIO_SECRET_KEY.
* Set SIL_NLP_DATA_PATH to "/silnlp" if you are not using B2 or MinIO and will be storing files locally.
* Set CLEARML_API_HOST to "https://api.sil.hosted.allegro.ai".
* Set B2_ENDPOINT_URL to "https://s3.us-east-005.backblazeb2.com".
* Set MINIO_ENDPOINT_URL to "https://truenas.psonet.languagetechnology.org:9000".
Expand Down
44 changes: 20 additions & 24 deletions silnlp/common/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,11 @@ def __init__(self):
self.is_bucket = False
self.bucket_service = os.getenv("BUCKET_SERVICE", "").lower()

self.set_s3_bucket()
self.set_data_dir()

def set_data_dir(self, data_dir: Optional[Path] = None):
data_dir = self.resolve_data_dir(data_dir)
if data_dir is None:
data_dir = self.resolve_data_dir()

self.data_dir = pathify(data_dir)

Expand Down Expand Up @@ -127,31 +128,28 @@ def set_alignment_dir(self, align_dir: Optional[Path] = None):
self.align_gold_dir = self.align_dir / "gold"
self.align_experiments_dir = self.align_dir / "experiments"

def resolve_data_dir(self, data_path) -> Path:
def resolve_data_dir(self) -> Path:
self.is_bucket = False
if data_path != "":
temp_path = Path(data_path)
sil_nlp_data_path = os.getenv("SIL_NLP_DATA_PATH", default="")
if sil_nlp_data_path != "" and self.bucket_service == "":
temp_path = Path(sil_nlp_data_path)
if temp_path.is_dir():
LOGGER.info(f"Using workspace: {data_path} as per environment variable data_path.")
return Path(data_path)
LOGGER.info(f"Using workspace: {sil_nlp_data_path} as per environment variable SIL_NLP_DATA_PATH.")
return Path(sil_nlp_data_path)
else:
temp_s3_path = S3Path(data_path)
if temp_s3_path.is_dir():
LOGGER.info(f"Using s3 workspace: {data_path}.")
self.is_bucket = True
return S3Path(data_path)
else:
raise Exception(
f"The path defined by environment variable data_path ({data_path}) is not a "
+ "real or s3 directory."
)
raise Exception(
f"The path defined by environment variable SIL_NLP_DATA_PATH ({sil_nlp_data_path}) is not a "
+ "real directory."
)

gutenberg_path = Path("G:/Shared drives/Gutenberg")
if gutenberg_path.is_dir():
LOGGER.info(f"Using workspace: {gutenberg_path}.")
return gutenberg_path

s3root = S3Path(data_path)
self.set_s3_bucket()
sil_nlp_data_path = f"/{self.bucket.name}"
s3root = S3Path(sil_nlp_data_path)
if s3root.is_dir():
LOGGER.info(f"Using s3 workspace: {s3root}.")
self.is_bucket = True
Expand All @@ -174,7 +172,6 @@ def set_resource(self, bucket_name: str, endpoint_url: str, access_key: str, sec
# Tests the connection to the bucket. Delete is used because it fails fast and is free of api cost from Backblaze.
bucket.delete_objects(Delete={"Objects": [{"Key": "conn_test_key"}]})
register_configuration_parameter(PureS3Path("/"), resource=resource)
self.set_data_dir(S3Path(f"/{bucket_name}"))
self.bucket = bucket

def set_s3_bucket(self):
Expand All @@ -187,7 +184,6 @@ def set_s3_bucket(self):
)
bucket = resource.Bucket("silnlp")
register_configuration_parameter(PureS3Path("/"), resource=resource)
self.set_data_dir(S3Path(f"/silnlp"))
self.bucket = bucket
self.bucket_service = "aws"
return
Expand All @@ -209,8 +205,8 @@ def set_s3_bucket(self):
LOGGER.info("Connected to MINIO bucket.")
self.bucket_service = "minio"
except Exception as e:
LOGGER.info(e)
LOGGER.info("MINIO connection failed.")
LOGGER.warning(e)
LOGGER.warning("MINIO connection failed.")
if self.bucket_service in ["", "b2"]:
try:
LOGGER.info("Trying to connect to B2 bucket.")
Expand All @@ -223,8 +219,8 @@ def set_s3_bucket(self):
LOGGER.info("Connected to B2 bucket.")
self.bucket_service = "b2"
except Exception as e:
LOGGER.info(e)
LOGGER.info("B2 connection failed.")
LOGGER.warning(e)
LOGGER.warning("B2 connection failed.")

def copy_pt_project_from_bucket(self, name: Union[str, Path], patterns: Union[str, Sequence[str]] = []):
if not self.is_bucket:
Expand Down

0 comments on commit 833e27b

Please sign in to comment.