Skip to content

Commit 281fbfd

Browse files
committed
chore: fetch lfs files on demand
1 parent 1323d39 commit 281fbfd

6 files changed

Lines changed: 105 additions & 182 deletions

File tree

README.md

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,31 @@ _Masonry MicroStructure Database_
44

55
## Requirements
66

7-
- Git LFS
87
- [uv](https://docs.astral.sh/uv/getting-started/installation/) Python package and project manager
98
- [npm](https://docs.npmjs.com/) Node.js package manager
9+
- Git LFS
1010
- Make
1111

1212

13-
## Getting the data
13+
## Cloning the repository
14+
15+
### Without large files
16+
17+
To clone the repository without getting the large files, run:
18+
19+
```bash
20+
export GIT_LFS_SKIP_SMUDGE=1
21+
git clone <repository_url>
22+
```
23+
24+
When running subsequent git commands (`checkout`, `pull`, ...), make sure to keep the `GIT_LFS_SKIP_SMUDGE` variable set to `1` in your shell environment to prevent large files from getting downloaded.
25+
1426

15-
After cloning the repository, make sure that you have Git LFS installed and that you have pulled the large files:
27+
### With large files
28+
29+
You need to pull large files if you intend to contribute new wall microstructure data.
30+
Make sure that you have Git LFS installed. Large files will be downloaded when cloning the repository.
31+
If you have already cloned the repository and want to fetch the large files, run:
1632

1733
```bash
1834
git config --global credential.helper store
@@ -81,7 +97,12 @@ Once the changes have been validated on the development server, they can be merg
8197

8298
## Deploying the website locally
8399

84-
Running the MMS Database website locally is a great way to test changes before pushing them to the main repository. Setup your environment by running:
100+
Running the MMS Database website locally is a great way to test changes before pushing them to the main repository. This can also work if large files weren't cloned locally, in which case they will be downloaded on demand when running the backend.
101+
102+
> [!NOTE]
103+
> If LFS data was not cloned, you need to set the variables `LFS_USERNAME` and `LFS_PASSWORD` in a `.env` file in the root directory of the repository.
104+
105+
Setup your environment by running:
85106

86107
```bash
87108
make install

backend/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
FROM python:3.13.6-bookworm
2-
ENV UV_VERSION=0.8.9
2+
ENV UV_VERSION=0.9.13
33

4-
RUN apt-get update && apt-get install -y git-lfs && git lfs install && rm -rf /var/lib/apt/lists/*
54
RUN pip install "uv==$UV_VERSION"
65

76
WORKDIR /app
87

98
COPY uv.lock pyproject.toml /app/
109
COPY start.sh /app/
1110
COPY api /app/api
11+
COPY ../data /data
1212

1313
RUN uv pip install --system --no-cache-dir -e /app
1414

backend/api/config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from functools import lru_cache
23

34
from pydantic_settings import BaseSettings
@@ -10,9 +11,8 @@ class Config(BaseSettings):
1011
LFS_USERNAME: str = ""
1112
LFS_PASSWORD: str = ""
1213
LFS_REPO_URL: str = "https://github.com/EPFL-ENAC/eesd-mmsdb.git"
13-
LFS_SERVER_URL: str = ""
14-
LFS_GIT_REF: str = ""
15-
LFS_CLONED_REPO_PATH: str = ".."
14+
LFS_SERVER_URL: str = "https://enac-it-git-lfs.epfl.ch/api/epfl-enac/eesd-mmsdb"
15+
DATA_PATH: str = os.path.join("..", "data")
1616

1717
UPLOAD_FILES_PATH: str = "/tmp/mmsdb_upload"
1818
UPLOAD_FILES_SUFFIX: str = ".ply,.obj,.stl"

backend/api/services/files.py

Lines changed: 28 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,16 @@
11
import json
22
import mimetypes
3-
import multiprocessing
43
import os
54
import shutil
65
import subprocess
76
from datetime import datetime
8-
from functools import cache as functools_cache
7+
from functools import cache
98
from logging import getLogger
109
from pathlib import Path
1110

1211
from api.config import config
1312
from api.models.files import Contribution, FileInfo, UploadInfo
14-
from fastapi import UploadFile
13+
from fastapi import HTTPException, UploadFile
1514

1615
logger = getLogger("uvicorn.error")
1716

@@ -21,132 +20,44 @@
2120
]
2221

2322

24-
def cleanup_git_lock(repo_path: str):
25-
"""Kill other git processes and remove .git/index.lock if exists."""
26-
try:
27-
subprocess.run(["pkill", "-f", "git"], check=False)
28-
except Exception:
29-
pass
30-
31-
git_lock = Path(repo_path) / ".git" / "index.lock"
32-
if git_lock.exists():
33-
try:
34-
git_lock.unlink()
35-
logger.info(f"Removed git lock file: {git_lock}")
36-
except Exception as e:
37-
logger.warning(f"Failed to remove git lock file: {e}")
38-
39-
40-
def cmd(command: str, working_directory: str | None = None) -> bytes:
41-
"""Run a shell command with real-time logging."""
42-
43-
logger.info(f"Running command: {command}")
44-
45-
process = subprocess.Popen(
46-
command.split(),
47-
stdout=subprocess.PIPE,
48-
stderr=subprocess.PIPE,
49-
cwd=working_directory,
50-
text=True,
51-
bufsize=1,
52-
universal_newlines=True,
53-
)
54-
55-
stdout_lines = []
56-
stderr_lines = []
57-
58-
while True:
59-
stdout_line = process.stdout.readline() if process.stdout else None
60-
stderr_line = process.stderr.readline() if process.stderr else None
61-
62-
if stdout_line:
63-
logger.info(f"STDOUT: {stdout_line.rstrip()}")
64-
stdout_lines.append(stdout_line)
65-
66-
if stderr_line:
67-
logger.error(f"STDERR: {stderr_line.rstrip()}")
68-
stderr_lines.append(stderr_line)
69-
70-
if process.poll() is not None:
71-
break
72-
73-
remaining_stdout, remaining_stderr = process.communicate()
74-
if remaining_stdout:
75-
for line in remaining_stdout.splitlines():
76-
if line.strip():
77-
logger.info(f"STDOUT: {line}")
78-
stdout_lines.append(line + "\n")
79-
80-
if remaining_stderr:
81-
for line in remaining_stderr.splitlines():
82-
if line.strip():
83-
logger.error(f"STDERR: {line}")
84-
stderr_lines.append(line + "\n")
23+
@cache
24+
def get_local_file_lfs_id(file_path: Path) -> str | None:
25+
"""Check if a local file is a Git LFS pointer file."""
26+
if not file_path.exists():
27+
return None
8528

86-
return_code = process.returncode
29+
try:
30+
with open(file_path, "r") as f:
31+
first_line = f.readline().strip()
32+
if first_line != "version https://git-lfs.github.com/spec/v1":
33+
return None
8734

88-
if return_code != 0:
89-
stderr_output = "".join(stderr_lines)
90-
raise Exception(f"Command failed: {command}\n{stderr_output}")
35+
second_line = f.readline().strip()
36+
if not second_line.startswith("oid sha256:"):
37+
return None
9138

92-
stdout_output = "".join(stdout_lines)
93-
return stdout_output.strip().encode("utf-8")
39+
return second_line.split(":")[1]
9440

41+
except Exception:
42+
return None
9543

96-
def init_lfs_data():
97-
"""Initialize LFS data by cloning the repository if not already done and checking out the specified git ref."""
9844

99-
if not config.LFS_GIT_REF:
100-
logger.info("LFS_GIT_REF is not set. Using local data.")
101-
return
45+
@cache
46+
def get_lfs_url(oid: str) -> str:
47+
"""Get the download URL for a Git LFS object ID."""
48+
if not config.LFS_USERNAME or not config.LFS_PASSWORD:
49+
raise HTTPException(
50+
status_code=401, detail="LFS credentials are not configured"
51+
)
10252

103-
lfs_server_url = config.LFS_SERVER_URL.replace(
53+
url = config.LFS_SERVER_URL.replace(
10454
"https://", f"https://{config.LFS_USERNAME}:{config.LFS_PASSWORD}@"
10555
)
106-
credentials_line = f"{lfs_server_url}\n"
107-
git_credentials_path = Path.home() / ".git-credentials"
108-
with open(git_credentials_path, "a") as f:
109-
f.write(credentials_line)
110-
cmd("git config --global credential.helper store")
111-
112-
if not os.path.exists(config.LFS_CLONED_REPO_PATH):
113-
logger.info("Creating parent directories for LFS repository clone...")
114-
os.makedirs(config.LFS_CLONED_REPO_PATH, exist_ok=True)
115-
logger.info("Cloning LFS repository...")
116-
cmd(f"git clone {config.LFS_REPO_URL} {config.LFS_CLONED_REPO_PATH}")
117-
cmd(
118-
f"git checkout {config.LFS_GIT_REF}",
119-
working_directory=config.LFS_CLONED_REPO_PATH,
120-
)
121-
cmd("git lfs pull", working_directory=config.LFS_CLONED_REPO_PATH)
122-
123-
else:
124-
logger.info(
125-
"LFS repository already cloned. Checking out the specified git ref and pulling..."
126-
)
127-
cmd("git reset --hard", working_directory=config.LFS_CLONED_REPO_PATH)
128-
cmd("git clean -fdx", working_directory=config.LFS_CLONED_REPO_PATH)
129-
cmd(
130-
f"git checkout {config.LFS_GIT_REF}",
131-
working_directory=config.LFS_CLONED_REPO_PATH,
132-
)
133-
cmd("git pull", working_directory=config.LFS_CLONED_REPO_PATH)
134-
cmd("git lfs pull", working_directory=config.LFS_CLONED_REPO_PATH)
13556

136-
logger.info("LFS data initialized.")
57+
return f"{url}/object/{oid}"
13758

13859

139-
def _init_lfs_data_wrapper():
140-
try:
141-
init_lfs_data()
142-
except Exception as e:
143-
logger.error(f"Failed to initialize LFS data (subprocess): {e}")
144-
logger.warning("Continuing without up to date LFS data (subprocess).")
145-
finally:
146-
cleanup_git_lock(config.LFS_CLONED_REPO_PATH)
147-
148-
149-
@functools_cache
60+
@cache
15061
def get_local_file_content(file_path: Path) -> tuple[bytes | None, str | None]:
15162
"""Read file content and determine MIME type."""
15263
if not file_path.exists():
@@ -314,8 +225,3 @@ def update_local_upload_info_state(relative_path: str, state: str) -> None:
314225
raise ValueError(f"Failed to update state in info file: {e}")
315226

316227
return
317-
318-
319-
multiprocessing.set_start_method("fork", force=True)
320-
p = multiprocessing.Process(target=_init_lfs_data_wrapper)
321-
p.start()

backend/api/services/properties.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,7 @@ async def get_data(self) -> pd.DataFrame:
2020
if self._data is not None:
2121
return self._data
2222

23-
properties_full_path = (
24-
Path(config.LFS_CLONED_REPO_PATH) / "data" / config.PROPERTIES_PATH
25-
)
23+
properties_full_path = Path(config.DATA_PATH) / config.PROPERTIES_PATH
2624
body, _ = get_local_file_content(properties_full_path)
2725
if body is None:
2826
raise HTTPException(
@@ -38,10 +36,7 @@ async def get_stone_data(self, wall_id: str) -> pd.DataFrame:
3836
return self._stone_data[wall_id]
3937

4038
properties_full_path = (
41-
Path(config.LFS_CLONED_REPO_PATH)
42-
/ "data"
43-
/ config.STONE_PROPERTIES_DIR_PATH
44-
/ f"{wall_id}.csv"
39+
Path(config.DATA_PATH) / config.STONE_PROPERTIES_DIR_PATH / f"{wall_id}.csv"
4540
)
4641
body, _ = get_local_file_content(properties_full_path)
4742
if body is None:

Comments (0)