Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ valid_dataset = auto_dataset.get_split(augs=val_augs.get_augmentations(), split=
model = ModelManager.get("fai-detr-l-obj365")

args = TrainerArgs(
run_name=f"{ds_name}-{model.model_info.name}",
run_name=f"{model.name}_{train_dataset.name}",
batch_size=16,
max_iters=50,
eval_period=50,
Expand Down
6 changes: 3 additions & 3 deletions docs/training.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ Optionally, if you are using the hub, you can specify `sync_to_hub=True` to trac
from focoos.ports import TrainerArgs

args = TrainerArgs(
run_name="football-tutorial", # the name of the experiment
output_dir="./experiments", # the folder where the model is saved
run_name=f"{model.name}_{train_dataset.name}", # the name of the experiment
output_dir="./experiments", # the folder where the model is saved (default: ~/FocoosAI/models)
batch_size=16, # how many images in each iteration
max_iters=500, # how many iterations the training lasts
eval_period=100, # how often (in iterations) the model is evaluated on the validation set
learning_rate=0.0001, # learning rate
weight_decay=0.0001, # regularization strength (set it properly to avoid under/over fitting)
sync_to_hub=True, # Use this to see the model under training on the platform
sync_to_hub=True, # Use this to sync model info, weights and metrics on the platform
)
```

Expand Down
8 changes: 8 additions & 0 deletions focoos/data/datasets/map_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ def __getitem__(self, idx):
if retry_count >= 3:
self.logger.warning("Failed to apply `_map_func` for idx: {}, retry count: {}".format(idx, retry_count))

# Read-only shortcut for the name stored in the wrapped dataset's metadata
# (`self.dataset.metadata.name`), so callers can write `train_dataset.name`.
@property
def name(self):
return self.dataset.metadata.name

# Read-only shortcut for the task stored in the wrapped dataset's metadata
# (`self.dataset.metadata.task`).
@property
def task(self):
return self.dataset.metadata.task

def preview(self, index=None, use_augmentations=True):
if not use_augmentations:
current_augmentations = self.mapper.augmentations
Expand Down
7 changes: 6 additions & 1 deletion focoos/models/focoos_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ def _setup_model_for_training(self, train_args: TrainerArgs, data_train: MapData
self.model_info.config["num_classes"] = len(data_train.dataset.metadata.classes)
self._reload_model()
self.model_info.name = train_args.run_name.strip()
self.processor = ProcessorManager.get_processor(self.model_info.model_family, self.model_info.config)
assert self.model_info.task == data_train.dataset.metadata.task, "Task mismatch between model and dataset."

def train(self, args: TrainerArgs, data_train: MapDataset, data_val: MapDataset, hub: Optional[FocoosHUB] = None):
Expand Down Expand Up @@ -252,6 +253,10 @@ def test(self, args: TrainerArgs, data_test: MapDataset):
else:
run_test(args, data_test, self.model, self.processor, self.model_info)

# Read-only shortcut for `self.model_info.name`, so callers can write
# `model.name` instead of `model.model_info.name`.
@property
def name(self):
return self.model_info.name

@property
def device(self):
"""Get the device where the model is located.
Expand Down Expand Up @@ -299,7 +304,7 @@ def task(self):

def export(
self,
runtime_type: RuntimeType = RuntimeType.ONNX_CUDA32,
runtime_type: RuntimeType = RuntimeType.TORCHSCRIPT_32,
onnx_opset: int = 17,
out_dir: Optional[str] = None,
device: Literal["cuda", "cpu"] = "cuda",
Expand Down
33 changes: 15 additions & 18 deletions focoos/trainer/hooks/sync_to_hub.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ def after_step(self):
)

def after_train(self):
# Catch exception and sync training info, final weights will be synced in main trainer fn
exc_type, exc_value, exc_traceback = sys.exc_info()
status = ModelStatus.TRAINING_COMPLETED
if exc_type is not None:
logger.error(
f"Exception during training, status set to TRAINING_ERROR: {str(exc_type.__name__)} {str(exc_value)}"
Expand All @@ -88,23 +88,20 @@ def after_train(self):
detail=f"{str(exc_type.__name__)}: {str(exc_value)}",
)
)

self.model_info.dump_json(os.path.join(self.output_dir, ArtifactName.INFO))
self._sync_train_job(
sync_info=HubSyncLocalTraining(
status=status,
iterations=self.iteration,
training_info=self.model_info.training_info,
),
upload_artifacts=[
ArtifactName.WEIGHTS,
ArtifactName.LOGS,
ArtifactName.PT,
ArtifactName.ONNX,
ArtifactName.INFO,
ArtifactName.METRICS,
],
)
self.model_info.dump_json(os.path.join(self.output_dir, ArtifactName.INFO))
self._sync_train_job(
sync_info=HubSyncLocalTraining(
status=status,
iterations=self.iteration,
training_info=self.model_info.training_info,
),
upload_artifacts=[
ArtifactName.WEIGHTS,
ArtifactName.LOGS,
ArtifactName.INFO,
ArtifactName.METRICS,
],
)

def _sync_train_job(self, sync_info: HubSyncLocalTraining, upload_artifacts: Optional[List[ArtifactName]] = None):
try:
Expand Down
139 changes: 71 additions & 68 deletions focoos/trainer/hooks/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,75 +125,78 @@ def _create_mosaic(self, images):
def _visualize(self):
training_mode = self.model.training

with ExitStack() as stack:
stack.enter_context(torch.no_grad())
stack.enter_context(inference_context(self.model))
stack.enter_context(inference_context(self.processor))

storage = get_event_storage()
self.model.eval()

all_visualized_images = []

for i in range(self.n_sample):
sample = self.samples[i]
sample["height"], sample["width"] = sample["image"].shape[-2:]

samples = [sample]
images, _ = self.processor.preprocess(samples, device=self.model.device, dtype=self.model.dtype)
outputs = self.model(images)
prediction = self.processor.eval_postprocess(outputs, samples)[0]

visualizer = Visualizer(
sample["image"].permute(1, 2, 0).cpu().numpy(),
self.metadata,
instance_mode=ColorMode.IMAGE,
)
if "panoptic_seg" in prediction:
panoptic_seg, segments_info = prediction["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
panoptic_seg.to(self.cpu_device), segments_info
try:
with ExitStack() as stack:
stack.enter_context(torch.no_grad())
stack.enter_context(inference_context(self.model))
stack.enter_context(inference_context(self.processor))

storage = get_event_storage()
self.model.eval()

all_visualized_images = []

for i in range(self.n_sample):
sample = self.samples[i]
sample["height"], sample["width"] = sample["image"].shape[-2:]

samples = [sample]
images, _ = self.processor.preprocess(samples, device=self.model.device, dtype=self.model.dtype)
outputs = self.model(images)
prediction = self.processor.eval_postprocess(outputs, samples)[0]

visualizer = Visualizer(
sample["image"].permute(1, 2, 0).cpu().numpy(),
self.metadata,
instance_mode=ColorMode.IMAGE,
)
elif "sem_seg" in prediction:
vis_output = visualizer.draw_sem_seg(prediction["sem_seg"].argmax(dim=0).to(self.cpu_device))
elif "instances" in prediction:
instances = prediction["instances"].to(self.cpu_device)
# filter based on confidence - fixed at 0.5
instances = instances[instances.scores > 0.5]
vis_output = visualizer.draw_instance_predictions(predictions=instances)
else:
vis_output = None

if vis_output is not None:
pred_img = vis_output.get_image()
# Individual samples are no longer saved to the storage
all_visualized_images.append(pred_img)

# Create and save mosaic if we have images and output directory
if all_visualized_images:
# Get current iteration for filename
try:
current_iter = self.trainer.iter
except (AttributeError, TypeError):
current_iter = 0

# Create mosaic
mosaic = self._create_mosaic(all_visualized_images)

if mosaic is not None:
# Save the mosaic to the storage instead of the individual samples
mosaic_transposed = mosaic.transpose(2, 0, 1) # HWC -> CHW
storage.put_image("Samples_Mosaic", mosaic_transposed)

# Save to disk if output_dir is provided
if self.output_dir is not None:
preview_dir = os.path.join(self.output_dir, "preview")
os.makedirs(preview_dir, exist_ok=True)

# Include iteration in filename
output_path = os.path.join(preview_dir, f"samples_iter_{current_iter}.jpg")
encode_params = [cv2.IMWRITE_JPEG_QUALITY, 80]
cv2.imwrite(output_path, mosaic, encode_params)
if "panoptic_seg" in prediction:
panoptic_seg, segments_info = prediction["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
panoptic_seg.to(self.cpu_device), segments_info
)
elif "sem_seg" in prediction:
vis_output = visualizer.draw_sem_seg(prediction["sem_seg"].argmax(dim=0).to(self.cpu_device))
elif "instances" in prediction:
instances = prediction["instances"].to(self.cpu_device)
# filter based on confidence - fixed at 0.5
instances = instances[instances.scores > 0.5]
vis_output = visualizer.draw_instance_predictions(predictions=instances)
else:
vis_output = None

if vis_output is not None:
pred_img = vis_output.get_image()
# Individual samples are no longer saved to the storage
all_visualized_images.append(pred_img)

# Create and save mosaic if we have images and output directory
if all_visualized_images:
# Get current iteration for filename
try:
current_iter = self.trainer.iter
except (AttributeError, TypeError):
current_iter = 0

# Create mosaic
mosaic = self._create_mosaic(all_visualized_images)

if mosaic is not None:
# Save the mosaic to the storage instead of the individual samples
mosaic_transposed = mosaic.transpose(2, 0, 1) # HWC -> CHW
storage.put_image("Samples_Mosaic", mosaic_transposed)

# Save to disk if output_dir is provided
if self.output_dir is not None:
preview_dir = os.path.join(self.output_dir, "preview")
os.makedirs(preview_dir, exist_ok=True)

# Include iteration in filename
output_path = os.path.join(preview_dir, f"samples_iter_{current_iter}.jpg")
encode_params = [cv2.IMWRITE_JPEG_QUALITY, 80]
cv2.imwrite(output_path, mosaic, encode_params)
except Exception as e:
logger.warning(f"Exception during visualization hook: {e}")

# set model back to training mode
self.model.train(training_mode)
Expand Down
17 changes: 14 additions & 3 deletions focoos/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from focoos.hub.remote_model import RemoteModel
from focoos.models.focoos_model import BaseModelNN
from focoos.nn.layers.norm import FrozenBatchNorm2d
from focoos.ports import ArtifactName, ModelInfo, ModelStatus, Task, TrainerArgs, TrainingInfo
from focoos.ports import ArtifactName, HubSyncLocalTraining, ModelInfo, ModelStatus, Task, TrainerArgs, TrainingInfo
from focoos.processor.base_processor import Processor
from focoos.trainer.checkpointer import Checkpointer
from focoos.trainer.evaluation.evaluator import inference_on_dataset
Expand Down Expand Up @@ -478,6 +478,19 @@ def train(self):
trainer_loop.train(start_iter=start_iter, max_iter=args.max_iters)
self.finished = True
self.finish()
if comm.is_main_process() and self.remote_model and self.args.sync_to_hub:
self.remote_model.sync_local_training_job(
local_training_info=HubSyncLocalTraining(
status=ModelStatus.TRAINING_COMPLETED,
iterations=self.args.max_iters,
training_info=self.model_info.training_info,
),
dir=self.output_dir,
upload_artifacts=[
ArtifactName.WEIGHTS,
ArtifactName.METRICS,
],
)

def test(self, restore_best: bool = False):
"""Run model evaluation on test set.
Expand Down Expand Up @@ -516,8 +529,6 @@ def test(self, restore_best: bool = False):
):
self.model_info.val_metrics = raw_metrics

self.finished = True
self.finish()
return eval_result

def _update_training_info_and_dump(self, new_status: ModelStatus, detail: Optional[str] = None):
Expand Down
9 changes: 4 additions & 5 deletions focoos/utils/visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,11 +487,10 @@ def _create_text_labels(classes, scores, class_names, is_crowd=None):
list[str] or None
"""
labels = None
if classes is not None:
if class_names is not None and len(class_names) > 0:
labels = [class_names[i] for i in classes]
else:
labels = [str(i) for i in classes]
if classes is not None and class_names is not None:
labels = [class_names[i] if i < len(class_names) else str(i) for i in classes]
else:
labels = [str(i) for i in classes]
if scores is not None:
if labels is None:
labels = ["{:.0f}%".format(s * 100) for s in scores]
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ dependencies = [
"pycocotools~=2.0.8",
"faster_coco_eval~=1.6.6",
"tensorboard~=2.19.0",

"orjson~=3.10.18",
"gradio~=5.31.0",
"torch~=2.7.0",
Expand All @@ -70,7 +69,6 @@ keywords = [
tensorrt = ["tensorrt==10.5.0"]
onnx = ["onnxruntime-gpu==1.22.0", "onnx>=1.17.0", "onnxslim~=0.1.54", "onnxscript~=0.2.7"]
onnx-cpu = ["onnxruntime==1.22.0","onnx>=1.18.0", "onnxslim~=0.1.54", "onnxscript~=0.2.7"]

dev = [
"pytest",
"pytest-cov",
Expand Down
6 changes: 4 additions & 2 deletions tutorials/hub.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@
" model = ModelManager.get(\"fai-detr-l-obj365\")\n",
"\n",
" args = TrainerArgs(\n",
" run_name=f\"{remote_dataset.name}-{model.model_info.name}\",\n",
" run_name=f\"{model.name}_{train_dataset.name}\",\n",
" output_dir=\"./experiments\",\n",
" amp_enabled=True,\n",
" batch_size=16,\n",
Expand All @@ -221,7 +221,9 @@
" sync_to_hub=True, # use this to sync model info, weights and metrics on the hub\n",
" )\n",
"\n",
" model.train(args, train_dataset, valid_dataset, hub=hub)"
" model.train(\n",
" args, train_dataset, valid_dataset, hub=hub\n",
" ) # Hub is optional, if not provided and sync_to_hub is True, will be created automatically"
]
}
],
Expand Down
10 changes: 4 additions & 6 deletions tutorials/inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,10 @@
]
},
{
"cell_type": "raw",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install 'focoos @ git+https://github.com/FocoosAI/focoos.git'"
]
Expand Down
Loading
Loading