Google Cloud Storage Integration #683
base: main
@@ -0,0 +1,74 @@
import time
from typing import IO, Callable, Iterable, Optional, Sequence

import smart_open
from google.cloud import storage
from model_engine_server.core.loggers import logger_name, make_logger

logger = make_logger(logger_name())

__all__: Sequence[str] = (
    "sync_storage_client",
    # `open` should be used, but so as to not shadow the built-in, the preferred import is:
    # >>> storage_client.open
    # Thus, it's not included in the wildcard imports.
    "sync_storage_client_keepalive",
    "gcs_fileobj_exists",
)


def sync_storage_client(**kwargs) -> storage.Client:
    return storage.Client(**kwargs)


def open(uri: str, mode: str = "rt", **kwargs) -> IO:  # pylint: disable=redefined-builtin
    if "transport_params" not in kwargs:
        kwargs["transport_params"] = {"client": sync_storage_client()}
    return smart_open.open(uri, mode, **kwargs)


def sync_storage_client_keepalive(
    gcp_client: storage.Client,
    buckets: Iterable[str],
    interval: int,
    is_cancelled: Callable[[], bool],
) -> None:
    """Keeps the connection pool warmed up for access to a list of GCP buckets.

    NOTE: :param:`is_cancelled` **MUST BE THREADSAFE**.
    """
    while True:
        if is_cancelled():
            logger.info("Ending GCP client keepalive: cancel invoked.")
            return
        for bucket in buckets:
            try:
                # Instead of head_bucket, for GCP we obtain the bucket object and reload it.
                bucket_obj = gcp_client.bucket(bucket)
                bucket_obj.reload()  # refreshes metadata and validates connectivity
            except Exception:  # pylint:disable=broad-except
                logger.exception(
                    f"Unexpected error in keepalive loop on accessing bucket: {bucket}"
                )
        time.sleep(interval)


def gcs_fileobj_exists(bucket: str, key: str, client: Optional[storage.Client] = None) -> bool:
    """
    Test if a file exists in GCP storage.
    :param bucket: GCP bucket name
    :param key: Blob name or file's path within the bucket
    :param client: A google.cloud.storage.Client instance
    :return: Whether the file exists on GCP or not
    """
    if client is None:
        client = sync_storage_client()
    try:
        bucket_obj = client.bucket(bucket)
        # get_blob returns None if the blob does not exist.
        blob = bucket_obj.get_blob(key)
    except Exception as e:
        logger.exception(f"Error checking file existence in bucket {bucket} for key {key}")
        raise e
    else:
        return blob is not None
@@ -0,0 +1,104 @@
import os
from typing import List, Optional

from google.cloud import storage

from model_engine_server.core.config import infra_config
from model_engine_server.domain.gateways.file_storage_gateway import (
    FileMetadata,
    FileStorageGateway,
)
from model_engine_server.infra.gateways.gcs_filesystem_gateway import GCSFilesystemGateway


def get_gcs_key(owner: str, file_id: str) -> str:
[Review comment] nit: I'd prefix these w/ an underscore so that no one is tempted to try and import these from outside this file, thus breaking Clean Architecture norms.
[Reply] btw I think the s3_file_storage_gateway also doesn't have the prefixes.
""" | ||
Constructs a GCS object key from the owner and file_id. | ||
""" | ||
return os.path.join(owner, file_id) | ||
|
||
|
||
def get_gcs_url(owner: str, file_id: str) -> str: | ||
""" | ||
Returns the gs:// URL for the bucket, using the GCS key. | ||
""" | ||
return f"gs://{infra_config().gcs_bucket}/{get_gcs_key(owner, file_id)}" | ||
|
||
|
||
class GCSFileStorageGateway(FileStorageGateway): | ||
""" | ||
Concrete implementation of a file storage gateway backed by GCS. | ||
""" | ||
|
||
def __init__(self): | ||
self.filesystem_gateway = GCSFilesystemGateway() | ||
|
||
async def get_url_from_id(self, owner: str, file_id: str) -> Optional[str]: | ||
""" | ||
Returns a signed GCS URL for the given file. | ||
""" | ||
try: | ||
return self.filesystem_gateway.generate_signed_url(get_gcs_url(owner, file_id)) | ||
except Exception: | ||
return None | ||
|
||
async def get_file(self, owner: str, file_id: str) -> Optional[FileMetadata]: | ||
""" | ||
Retrieves file metadata if it exists. Returns None if the file is missing. | ||
""" | ||
try: | ||
client = self.filesystem_gateway.get_storage_client({}) | ||
bucket = client.bucket(infra_config().gcs_bucket) | ||
[Review comment] I know this pattern was already there, but I think it'd probably make more sense to pass the bucket into the constructor of this class. This way, there's one less dependency on the old [...]. Could also make the argument to just pass in the bucket as an argument with every call.
[Reply] I'm fine with having the bucket be passed in the constructor (in addition to anything else from any configs).
            blob = bucket.blob(get_gcs_key(owner, file_id))
            blob.reload()  # Fetch metadata
            return FileMetadata(
                id=file_id,
                filename=file_id,
                size=blob.size,
                owner=owner,
                updated_at=blob.updated,
            )
        except Exception:
            return None

    async def get_file_content(self, owner: str, file_id: str) -> Optional[str]:
        """
        Reads and returns the string content of the file.
        """
        try:
            with self.filesystem_gateway.open(get_gcs_url(owner, file_id)) as f:
                return f.read()
        except Exception:
            return None

    async def upload_file(self, owner: str, filename: str, content: bytes) -> str:
        """
        Uploads the file to the GCS bucket. Returns the filename used in the bucket.
        """
        with self.filesystem_gateway.open(get_gcs_url(owner, filename), mode="w") as f:
            f.write(content.decode("utf-8"))
        return filename

    async def delete_file(self, owner: str, file_id: str) -> bool:
        """
        Deletes the file from the GCS bucket. Returns True if successful, False otherwise.
        """
        try:
            client = self.filesystem_gateway.get_storage_client({})
            bucket = client.bucket(infra_config().gcs_bucket)
            blob = bucket.blob(get_gcs_key(owner, file_id))
            blob.delete()
            return True
        except Exception:
            return False

    async def list_files(self, owner: str) -> List[FileMetadata]:
        """
        Lists all files in the GCS bucket for the given owner.
        """
        client = self.filesystem_gateway.get_storage_client({})
        blobs = client.list_blobs(infra_config().gcs_bucket, prefix=owner)
        files = [
            await self.get_file(owner, b.name[len(owner) + 1 :])
            for b in blobs
            if b.name != owner
        ]
        return [f for f in files if f is not None]
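Picking up the review thread above about constructor injection, here is a rough sketch of how the bucket could be passed into `GCSFileStorageGateway` instead of being read from `infra_config()` inside each method. This is an assumption about one possible shape, not something in this PR; only `delete_file` is shown, and the fallback to `infra_config()` is kept so existing call sites would not break.

```python
# Hypothetical variant (not in this PR): inject the bucket via the constructor.
from typing import Optional

from model_engine_server.core.config import infra_config
from model_engine_server.domain.gateways.file_storage_gateway import FileStorageGateway
from model_engine_server.infra.gateways.gcs_filesystem_gateway import GCSFilesystemGateway

# get_gcs_key would be the same helper defined in the file above.


class GCSFileStorageGateway(FileStorageGateway):
    def __init__(
        self,
        gcs_bucket: Optional[str] = None,
        filesystem_gateway: Optional[GCSFilesystemGateway] = None,
    ):
        # Fall back to the existing config lookup so current behavior is preserved.
        self.gcs_bucket = gcs_bucket or infra_config().gcs_bucket
        self.filesystem_gateway = filesystem_gateway or GCSFilesystemGateway()

    async def delete_file(self, owner: str, file_id: str) -> bool:
        try:
            client = self.filesystem_gateway.get_storage_client({})
            blob = client.bucket(self.gcs_bucket).blob(get_gcs_key(owner, file_id))
            blob.delete()
            return True
        except Exception:
            return False
```

Constructor injection would also make the gateway easier to unit test, since a fake bucket name and a stub filesystem gateway could be passed in directly.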
@@ -0,0 +1,46 @@
import datetime
import os
import re
from typing import IO, Optional, Dict

import smart_open
from google.cloud import storage
from model_engine_server.infra.gateways.filesystem_gateway import FilesystemGateway


class GCSFilesystemGateway(FilesystemGateway):
    """
    Concrete implementation for interacting with Google Cloud Storage.
    """

    def get_storage_client(self, kwargs: Optional[Dict]) -> storage.Client:
        """
        Retrieve or create a Google Cloud Storage client. Could optionally
        utilize environment variables or passed-in credentials.
        """
        kwargs = kwargs or {}  # tolerate None, since the parameter is Optional
        project = kwargs.get("gcp_project", os.getenv("GCP_PROJECT"))
[Review comment] Where does this env var get set? It seems analogous to [...]
        return storage.Client(project=project)

    def open(self, uri: str, mode: str = "rt", **kwargs) -> IO:
        """
        Uses smart_open to handle reading/writing to GCS.
        """
        # The `transport_params` is how smart_open passes in the storage client
        client = self.get_storage_client(kwargs)
        transport_params = {"client": client}
        return smart_open.open(uri, mode, transport_params=transport_params)

    def generate_signed_url(self, uri: str, expiration: int = 3600, **kwargs) -> str:
        """
        Generate a signed URL for the given GCS URI, valid for `expiration` seconds.
        """
        # Expecting URIs in the form: 'gs://bucket_name/some_key'
        match = re.search(r"^gs://([^/]+)/(.+)$", uri)
        if not match:
            raise ValueError(f"Invalid GCS URI: {uri}")

        bucket_name, blob_name = match.groups()
        client = self.get_storage_client(kwargs)
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(blob_name)

        # Pass a timedelta so the URL is unambiguously valid for `expiration` seconds from now.
        return blob.generate_signed_url(expiration=datetime.timedelta(seconds=expiration))
[Review comment] note: I think this is only really used for some fine-tuning APIs that aren't really used at this point. It's fine to keep, of course, since you'll probably need to initialize dependencies anyway, but this code probably won't really get exercised at all.
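For completeness, here is a small, hypothetical usage sketch of `GCSFilesystemGateway`; it is not part of the diff, and the project id, bucket, and key below are made up.

```python
# Hypothetical usage of GCSFilesystemGateway (names below are made up).
import os

from model_engine_server.infra.gateways.gcs_filesystem_gateway import GCSFilesystemGateway

os.environ.setdefault("GCP_PROJECT", "example-gcp-project")  # assumed way this var is supplied

gateway = GCSFilesystemGateway()

uri = "gs://example-model-bucket/results/output.txt"

# Write an object through smart_open, then hand out a time-limited signed URL for it.
with gateway.open(uri, mode="w") as f:
    f.write("hello from the gateway")

signed_url = gateway.generate_signed_url(uri, expiration=900)  # 15 minutes
print(signed_url)
```

Note that signed URL generation generally needs credentials capable of signing (for example, a service account key, or IAM signBlob permission), so default workload credentials may require extra setup.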