@@ -40,12 +40,16 @@ def __init__(self, config: traitlets.config.Config) -> None:
         self._config = DrivesConfig(config=config)
         self._client = httpx.AsyncClient()
         self._content_managers = {}
+        self._max_files_listed = 1000

         # initiate boto3 session if we are dealing with S3 drives
         if self._config.provider == 's3':
             self._s3_clients = {}
             if self._config.access_key_id and self._config.secret_access_key:
-                self._s3_session = boto3.Session(aws_access_key_id=self._config.access_key_id, aws_secret_access_key=self._config.secret_access_key)
+                if self._config.session_token is None:
+                    self._s3_session = boto3.Session(aws_access_key_id=self._config.access_key_id, aws_secret_access_key=self._config.secret_access_key)
+                else:
+                    self._s3_session = boto3.Session(aws_access_key_id=self._config.access_key_id, aws_secret_access_key=self._config.secret_access_key, aws_session_token=self._config.session_token)
             else:
                 raise tornado.web.HTTPError(
                     status_code=httpx.codes.BAD_REQUEST,
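A note on the branching above: `boto3.Session` accepts `aws_session_token=None` (it is the parameter's default), so the two calls could be collapsed into one. A minimal sketch of that simplification, using the same `DrivesConfig` fields:

    # aws_session_token defaults to None, so passing it unconditionally is safe
    self._s3_session = boto3.Session(
        aws_access_key_id=self._config.access_key_id,
        aws_secret_access_key=self._config.secret_access_key,
        aws_session_token=self._config.session_token,
    )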
@@ -71,6 +75,22 @@ def per_page_argument(self) -> Optional[Tuple[str, int]]:
         """
         return ("per_page", 100)

+    def set_listing_limit(self, new_limit):
+        """Set a new limit for listing.
+
+        Args:
+            new_limit: new maximum number of files to list
+        """
+        if not isinstance(new_limit, int) or new_limit <= 0:
+            raise tornado.web.HTTPError(
+                status_code=httpx.codes.BAD_REQUEST,
+                reason=f"Invalid listing limit: {new_limit!r}. The limit must be a positive integer."
+            )
+        self._max_files_listed = new_limit
+
     async def list_drives(self):
         """Get list of available drives.

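For context, a hypothetical caller could cap listings before fetching directory contents; this usage sketch assumes `manager` is an instance of the class being patched and that the drive and path exist:

    manager.set_listing_limit(500)  # list at most 500 files per call
    contents = await manager.get_contents("my-drive", "some/path")  # hypothetical drive and path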
@@ -126,15 +146,25 @@ async def mount_drive(self, drive_name, provider, region):

         Args:
             drive_name: name of drive to mount
-
-        Returns:
-            The content manager for the drive.
         """
         try:
             # check if content manager doesn't already exist
             if drive_name not in self._content_managers or self._content_managers[drive_name] is None:
                 if provider == 's3':
-                    store = obs.store.S3Store.from_url("s3://" + drive_name + "/", config={"aws_access_key_id": self._config.access_key_id, "aws_secret_access_key": self._config.secret_access_key, "aws_region": region})
+                    if self._config.session_token is None:
+                        configuration = {
+                            "aws_access_key_id": self._config.access_key_id,
+                            "aws_secret_access_key": self._config.secret_access_key,
+                            "aws_region": region
+                        }
+                    else:
+                        configuration = {
+                            "aws_access_key_id": self._config.access_key_id,
+                            "aws_secret_access_key": self._config.secret_access_key,
+                            "aws_session_token": self._config.session_token,
+                            "aws_region": region
+                        }
+                    store = obs.store.S3Store.from_url("s3://" + drive_name + "/", config=configuration)
                 elif provider == 'gcs':
                     store = obs.store.GCSStore.from_url("gs://" + drive_name + "/", config={})  # add gcs config
                 elif provider == 'http':
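The two nearly identical dict literals above could also be avoided by building the base configuration once and adding the token key only when present. A minimal sketch of that alternative, using the same fields:

    configuration = {
        "aws_access_key_id": self._config.access_key_id,
        "aws_secret_access_key": self._config.secret_access_key,
        "aws_region": region,
    }
    if self._config.session_token is not None:
        # include the token key only when a session token was configured
        configuration["aws_session_token"] = self._config.session_token
    store = obs.store.S3Store.from_url("s3://" + drive_name + "/", config=configuration)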
@@ -193,23 +223,43 @@ async def get_contents(self, drive_name, path):
         isDir = False
         emptyDir = True  # assume we are dealing with an empty directory

+        chunk_size = 100
+        if self._max_files_listed < chunk_size:
+            chunk_size = self._max_files_listed
+        no_batches = self._max_files_listed // chunk_size
+
         # using Arrow lists as they are recommended for large results
         # stream will be an async iterable of RecordBatch
-        stream = obs.list(self._content_managers[drive_name]["store"], path, chunk_size=100, return_arrow=True)
+        current_batch = 0
+        stream = obs.list(self._content_managers[drive_name]["store"], path, chunk_size=chunk_size, return_arrow=True)
         async for batch in stream:
+            current_batch += 1
+            # reached the last batch that can be shown (possibly only partially)
+            if current_batch == no_batches + 1:
+                remaining_files = self._max_files_listed - no_batches * chunk_size
+
             # if content exists we are dealing with a directory
             if isDir is False and batch:
                 isDir = True
                 emptyDir = False

             contents_list = pyarrow.record_batch(batch).to_pylist()
             for object in contents_list:
+                # when listing the last (partial) batch, make sure we don't exceed the limit
+                if current_batch == no_batches + 1:
+                    if remaining_files <= 0:
+                        break
+                    remaining_files -= 1
                 data.append({
                     "path": object["path"],
                     "last_modified": object["last_modified"].isoformat(),
                     "size": object["size"],
                 })

+            # check if we reached the limit of files that can be listed
+            if current_batch == no_batches + 1:
+                break
+
         # check if we are dealing with an empty drive
         if isDir is False and path != '':
             content = b""
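To see how the listing cap maps onto batches, here is an illustrative standalone helper (hypothetical, not part of the patch) with two worked checks that mirror the arithmetic above:

    def batches_for_cap(max_files: int, chunk_size: int) -> tuple[int, int]:
        # number of full batches, plus files to take from one final partial batch
        return max_files // chunk_size, max_files % chunk_size

    assert batches_for_cap(1000, 100) == (10, 0)  # default cap: 10 full batches, no partial one
    assert batches_for_cap(250, 100) == (2, 50)   # cap of 250: 2 full batches, then 50 more files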