Skip to content

Commit 528b485

Browse files
Remove nemo imports (#594)
Signed-off-by: Onur Yilmaz <oyilmaz@nvidia.com>
1 parent 0997912 commit 528b485

11 files changed

Lines changed: 60 additions & 836 deletions

nemo_deploy/service/fastapi_interface_to_pytriton_multimodal.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515
import json
16+
import logging
1617
import os
1718
from typing import List, Optional
1819

@@ -24,12 +25,7 @@
2425

2526
from nemo_deploy.multimodal.query_multimodal import NemoQueryMultimodalPytorch
2627

27-
try:
28-
from nemo.utils import logging
29-
except (ImportError, ModuleNotFoundError):
30-
import logging
31-
32-
logging = logging.getLogger(__name__)
28+
logger = logging.getLogger(__name__)
3329

3430

3531
class TritonSettings(BaseSettings):
@@ -44,8 +40,8 @@ def __init__(self):
4440
self._triton_service_port = int(os.environ.get("TRITON_PORT", 8000))
4541
self._triton_service_ip = os.environ.get("TRITON_HTTP_ADDRESS", "0.0.0.0")
4642
except Exception as error:
47-
logging.error(
48-
"An exception occurred trying to retrieve set args in TritonSettings class. Error:",
43+
logger.error(
44+
"An exception occurred trying to retrieve set args in TritonSettings class. Error: %s",
4945
error,
5046
)
5147
return
@@ -146,7 +142,7 @@ async def check_triton_health():
146142
triton_url = (
147143
f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready"
148144
)
149-
logging.info(f"Attempting to connect to Triton server at: {triton_url}")
145+
logger.info(f"Attempting to connect to Triton server at: {triton_url}")
150146
try:
151147
response = requests.get(triton_url, timeout=5)
152148
if response.status_code == 200:
@@ -271,7 +267,7 @@ async def completions_v1(request: MultimodalCompletionRequest):
271267

272268
output_serializable = convert_numpy(output)
273269
output_serializable["choices"][0]["text"] = output_serializable["choices"][0]["text"][0][0]
274-
logging.info(f"Output: {output_serializable}")
270+
logger.info(f"Output: {output_serializable}")
275271
return output_serializable
276272

277273

@@ -349,5 +345,5 @@ async def chat_completions_v1(request: MultimodalChatCompletionRequest):
349345
0
350346
][0]
351347

352-
logging.info(f"Output: {output_serializable}")
348+
logger.info(f"Output: {output_serializable}")
353349
return output_serializable

nemo_export/onnx_llm_exporter.py

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414

1515

16+
import logging
1617
import warnings
1718
from pathlib import Path
1819
from typing import Any, Callable, Dict, List, Optional, Union
@@ -35,12 +36,7 @@
3536
UnavailableError,
3637
)
3738

38-
try:
39-
from nemo.utils import logging
40-
except (ImportError, ModuleNotFoundError):
41-
import logging
42-
43-
logging = logging.getLogger(__name__)
39+
logger = logging.getLogger(__name__)
4440

4541
try:
4642
import modelopt.torch.quantization as mtq
@@ -90,15 +86,15 @@ def wrapper(*args, **kwargs):
9086
try:
9187
from pytriton.decorators import batch
9288
except Exception:
93-
logging.warning("PyTriton is not available.")
89+
logger.warning("PyTriton is not available.")
9490
use_pytriton = False
9591

9692

9793
use_onnxruntime = True
9894
try:
9995
import onnxruntime
10096
except Exception:
101-
logging.warning("onnxruntime is not available.")
97+
logger.warning("onnxruntime is not available.")
10298
use_onnxruntime = False
10399

104100

@@ -255,7 +251,7 @@ def _export_to_onnx(
255251
verbose=verbose,
256252
opset_version=opset,
257253
)
258-
logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")
254+
logger.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")
259255

260256
existing_directory_path = Path(self.onnx_model_dir) / "tokenizer"
261257
existing_directory_path.mkdir(exist_ok=True)
@@ -285,7 +281,7 @@ def export_onnx_to_trt(
285281
if not HAVE_TENSORRT:
286282
raise UnavailableError(MISSING_TENSORRT_MSG)
287283

288-
logging.info(f"Building TRT engine from ONNX model ({self.onnx_model_path})")
284+
logger.info(f"Building TRT engine from ONNX model ({self.onnx_model_path})")
289285
trt_logger = trt.Logger(trt.Logger.WARNING)
290286
builder = trt.Builder(trt_logger)
291287
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
@@ -295,9 +291,9 @@ def export_onnx_to_trt(
295291
# we use parse_from_file() instead of parse() because it can be used for both single
296292
# file models as well as externally stored models (required when model >2GiB)
297293
if not parser.parse_from_file(self.onnx_model_path):
298-
logging.warning("ONNX model could not be parsed")
294+
logger.warning("ONNX model could not be parsed")
299295
for error in range(parser.num_errors):
300-
logging.error(parser.get_error(error))
296+
logger.error(parser.get_error(error))
301297
return
302298

303299
if profiles:
@@ -316,22 +312,22 @@ def export_onnx_to_trt(
316312
config.add_optimization_profile(optimization_profile)
317313

318314
if trt_dtype == "fp16":
319-
logging.info("Setting Build Flag FP16")
315+
logger.info("Setting Build Flag FP16")
320316
config.set_flag(trt.BuilderFlag.FP16)
321317
elif trt_dtype == "fp8":
322318
# With FP8 export we want to also enable FP16 layers as a fallback instead of FP32
323-
logging.info("Setting Build Flag FP8 and FP16")
319+
logger.info("Setting Build Flag FP8 and FP16")
324320
config.set_flag(trt.BuilderFlag.FP8)
325321
config.set_flag(trt.BuilderFlag.FP16)
326322
validate_fp8_network(network)
327323

328324
# patch network
329325
if override_layernorm_precision_to_fp32:
330-
logging.info("Overriding TensorRT network LayerNorm precision to float32.")
326+
logger.info("Overriding TensorRT network LayerNorm precision to float32.")
331327
self._override_layernorm_precision_to_fp32(network)
332328

333329
if override_layers_to_fp32:
334-
logging.info("Overriding some layers to float32.")
330+
logger.info("Overriding some layers to float32.")
335331
self._override_layers_to_fp32(network, override_layers_to_fp32)
336332

337333
try:
@@ -343,7 +339,7 @@ def export_onnx_to_trt(
343339
except KeyError:
344340
error_msg = "Unknown profiling verbosity value."
345341
raise ValueError(error_msg)
346-
logging.info(f"Setting Profiling Verbosity to {config.profiling_verbosity}")
342+
logger.info(f"Setting Profiling Verbosity to {config.profiling_verbosity}")
347343

348344
if trt_builder_flags is not None:
349345
for flag in trt_builder_flags:
@@ -357,7 +353,7 @@ def export_onnx_to_trt(
357353
trt_model_path.mkdir(parents=True, exist_ok=True)
358354
trt_model_path = trt_model_path / "model.plan"
359355
trt_model_path.write_bytes(engine_string)
360-
logging.info(f"Successfully exported ONNX model ({self.onnx_model_path}) to TRT engine ({trt_model_path})")
356+
logger.info(f"Successfully exported ONNX model ({self.onnx_model_path}) to TRT engine ({trt_model_path})")
361357

362358
def _override_layer_precision_to_fp32(self, layer: trt.ILayer) -> None:
363359
if not HAVE_TENSORRT:
@@ -378,7 +374,7 @@ def _override_layers_to_fp32(self, network: trt.INetworkDefinition, fp32_layer_p
378374
trt.float16,
379375
}:
380376
if layer.type in {trt.LayerType.CAST}:
381-
logging.info(f"Skipping overriding {layer.type} layer {i} {layer_name} dtype")
377+
logger.info(f"Skipping overriding {layer.type} layer {i} {layer_name} dtype")
382378
continue
383379
if any(
384380
layer.get_input(input_idx).dtype in {trt.float32, trt.float16}
@@ -387,11 +383,11 @@ def _override_layers_to_fp32(self, network: trt.INetworkDefinition, fp32_layer_p
387383
# Note: Assigning to layer.precision (even the same value) sets precision_is_set=True,
388384
# which prevents TensorRT from changing this layer's precision
389385
layer.precision = trt.float32
390-
logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) precision to FP32")
386+
logger.info(f"Setting layer {i} {layer_name} (type: {layer.type}) precision to FP32")
391387
for j in range(layer.num_outputs):
392388
if layer.get_output_type(j) in {trt.float32, trt.float16}:
393389
layer.set_output_type(j, trt.float32)
394-
logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")
390+
logger.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")
395391

396392
def _override_layernorm_precision_to_fp32(self, network: trt.INetworkDefinition) -> None:
397393
"""Set the precision of LayerNorm subgraphs to FP32 to preserve accuracy.
@@ -506,9 +502,9 @@ def quantize(
506502
)
507503
quant_cfg = QUANT_CFG_CHOICES[quant_cfg]
508504

509-
logging.info("Starting quantization...")
505+
logger.info("Starting quantization...")
510506
mtq.quantize(self.model, quant_cfg, forward_loop=forward_loop)
511-
logging.info("Quantization is completed.")
507+
logger.info("Quantization is completed.")
512508

513509
@property
514510
def get_model(self):

0 commit comments

Comments (0)