Commit 7716b56

changes

sophie460 committed Oct 2, 2024
1 parent 32f83b2 commit 7716b56
Showing 3 changed files with 17 additions and 11 deletions.
5 changes: 4 additions & 1 deletion requirements-dev.lock
@@ -120,6 +120,7 @@ packaging==24.1
     # via matplotlib
     # via mne
     # via pooch
+    # via tensorboard
     # via transformers
 pandas==2.2.3
     # via datasets
@@ -170,7 +171,7 @@ soundfile==0.12.1
     # via repo-sophie-1
 sympy==1.13.3
     # via torch
-tensorboard==2.17.0
+tensorboard==2.18.0
     # via repo-sophie-1
 tensorboard-data-server==0.7.2
     # via tensorboard
@@ -203,6 +204,8 @@ xxhash==3.5.0
     # via evaluate
 yarl==1.13.1
     # via aiohttp
+pip==24.2
+    # via repo-sophie-1
 setuptools==75.1.0
     # via tensorboard
     # via torch
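Beyond the tensorboard 2.17.0 → 2.18.0 bump, the lock now records pip and setuptools explicitly (the lock itself lists setuptools as a tensorboard dependency). A minimal sanity check that the pinned versions actually resolved in the active environment, assuming Python 3.8+ (hypothetical helper, not part of this commit):

# check_pins.py - hypothetical helper, not part of this commit
from importlib.metadata import PackageNotFoundError, version

PINS = {"tensorboard": "2.18.0", "pip": "24.2", "setuptools": "75.1.0"}

for pkg, expected in PINS.items():
    try:
        installed = version(pkg)
        status = "ok" if installed == expected else f"MISMATCH (lock wants {expected})"
        print(f"{pkg}=={installed}  {status}")
    except PackageNotFoundError:
        print(f"{pkg} missing (lock wants {expected})")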
5 changes: 4 additions & 1 deletion requirements.lock
@@ -120,6 +120,7 @@ packaging==24.1
     # via matplotlib
     # via mne
     # via pooch
+    # via tensorboard
     # via transformers
 pandas==2.2.3
     # via datasets
@@ -170,7 +171,7 @@ soundfile==0.12.1
     # via repo-sophie-1
 sympy==1.13.3
     # via torch
-tensorboard==2.17.0
+tensorboard==2.18.0
     # via repo-sophie-1
 tensorboard-data-server==0.7.2
     # via tensorboard
@@ -203,6 +204,8 @@ xxhash==3.5.0
     # via evaluate
 yarl==1.13.1
     # via aiohttp
+pip==24.2
+    # via repo-sophie-1
 setuptools==75.1.0
     # via tensorboard
     # via torch
18 changes: 9 additions & 9 deletions speech_training_demo.py
@@ -113,23 +113,23 @@ def parse_args():
     parser.add_argument(
         "--dataset_name",
         type=str,
-        default="MLCommons/peoples_speech",
+        default="hf-internal-testing/librispeech_asr_dummy", # MLCommons/peoples_speech
         help="The name of the dataset to use (via the datasets library).",
     )
     parser.add_argument(
         "--dataset_config_names",
         nargs="+",
         type=str,
         required=False,
-        default=["default"],
+        default=["clean"],
         help="The configuration names of the dataset to use (via the datasets library).",
     )
     parser.add_argument(
         "--dataset_split_names",
         nargs="+",
         type=str,
         required=False,
-        default=["validation[2%:]", "test[:2%]"], #each about 600h (30k in total)
+        default=["validation", "test"], #each about 600h (30k in total)
         help="The names of the training data set splits to use (via the datasets library).",
     )
     parser.add_argument(
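These new defaults swap the 30,000-hour MLCommons/peoples_speech corpus for a tiny test fixture, so the inherited "#each about 600h" comment no longer describes the new values. A minimal sketch of what the new defaults load, assuming the standard datasets API:

from datasets import load_dataset

# a handful of short LibriSpeech clips; downloads in seconds
split = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",             # the new --dataset_config_names default
    split="validation",  # one of the new --dataset_split_names defaults
)
print(split)  # columns include 'audio' and 'text'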
@@ -164,7 +164,7 @@ def parse_args():
     parser.add_argument(
         "--validation_split_percentage",
         type=int,
-        default=1,
+        default=10,
         help="Percentage of training data that should be used for validation if no validation is present in dataset.",
     )
     parser.add_argument(
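Raising --validation_split_percentage from 1 to 10 matters on a tiny dataset: 1% of a few dozen clips rounds down to nothing. The carve-out uses datasets' percent-slicing syntax, the same syntax the old "validation[2%:]" default relied on; a sketch under that assumption:

from datasets import load_dataset

p = 10  # the new --validation_split_percentage default
name = "hf-internal-testing/librispeech_asr_dummy"

valid = load_dataset(name, "clean", split=f"validation[:{p}%]")
train = load_dataset(name, "clean", split=f"validation[{p}%:]")
print(len(valid), len(train))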
@@ -176,7 +176,7 @@
     parser.add_argument(
         "--saving_steps",
         type=int,
-        default=10000,
+        default=10,
         help="Number of steps between each logging",
     )
     parser.add_argument(
@@ -233,7 +233,7 @@ def parse_args():
     parser.add_argument(
         "--max_train_steps",
         type=int,
-        default=20000,
+        default=200,
         help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
     )
     parser.add_argument(
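Cutting --max_train_steps from 20000 to 200 (together with --saving_steps down to 10 above) turns the run into a quick smoke test. In the usual accelerate training recipe this kind of script follows, the step budget overrides the epoch count, as the help text says; a sketch of that arithmetic with hypothetical numbers:

import math

# hypothetical values for illustration
len_dataloader = 73                # batches per epoch
gradient_accumulation_steps = 8
num_train_epochs = 3
max_train_steps = 200              # the new default

num_update_steps_per_epoch = math.ceil(len_dataloader / gradient_accumulation_steps)
if max_train_steps is None:
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
else:
    # --max_train_steps wins: epochs are recomputed from the step budget
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
print(num_train_epochs, max_train_steps)  # 20 200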
@@ -520,7 +520,7 @@ def main():
             trust_remote_code=True, #################################################################################################################################
         )
         datasets_splits.append(dataset_split)
-    datasets_splits.to(device) ###############################################################################################################################
+    datasets_splits ###############################################################################################################################

     # Next, we concatenate all configurations and splits into a single training dataset
     raw_datasets = DatasetDict()
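The deleted call was a bug: datasets_splits is a plain Python list of datasets.Dataset objects, and neither lists nor Dataset expose a .to(device) method (that API belongs to torch tensors and modules), so the line raised AttributeError; the replacement is a bare no-op expression left as a placeholder. The concatenation the comment announces would look roughly like this, assuming the standard datasets API (the two dummy-split halves stand in for multiple configs/splits):

from datasets import DatasetDict, concatenate_datasets, load_dataset

name = "hf-internal-testing/librispeech_asr_dummy"
datasets_splits = [
    load_dataset(name, "clean", split=s)
    for s in ("validation[:50%]", "validation[50%:]")
]

raw_datasets = DatasetDict()
raw_datasets["train"] = (
    datasets_splits[0]
    if len(datasets_splits) == 1
    else concatenate_datasets(datasets_splits)
)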
@@ -704,7 +704,7 @@ def prepare_dataset(batch):
             percent_masked = num_losses / sub_attention_mask.sum()

             # forward
-            outputs = model(**batch).to(device)
+            outputs = model(**batch) # .to(device)

             # divide loss by gradient accumulation steps since gradients
             # are accumulated for multiple backward passes in PyTorch
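The removed .to(device) here was another AttributeError in waiting: model(**batch) returns a ModelOutput dataclass, not a tensor, so there is nothing to move after the forward pass. Device placement belongs on the input tensors (or is handled by accelerate's prepared dataloader); a minimal sketch of the manual pattern:

import torch

def forward_on_device(model: torch.nn.Module, batch: dict, device: torch.device):
    """Move the input tensors (not the output object) onto the device."""
    batch = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch.items()}
    return model(**batch)  # a ModelOutput; its .loss/.logits already live on `device`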
@@ -759,7 +759,7 @@ def prepare_dataset(batch):
                     writer.add_scalar("loss/train", float((loss * args.gradient_accumulation_steps) / num_losses), step)
                     writer.flush()
-
+                    print("yay its logging")

                 progress_bar.update(1)
                 completed_steps += 1

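This logging block assumes a tensorboard SummaryWriter created earlier in the script; the added print is just a debug breadcrumb confirming the block fires. A self-contained sketch of the same add_scalar pattern, with a stand-in value for the de-normalized loss and a hypothetical log directory:

from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir="runs/speech_demo")  # hypothetical log dir
for step in range(5):
    fake_loss = 1.0 / (step + 1)  # stand-in for (loss * grad_accum_steps) / num_losses
    writer.add_scalar("loss/train", fake_loss, step)
    writer.flush()
writer.close()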