parser.add_argument('--dec-init-mods', default='att.,dec.',
                    type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
                    help='List of decoder modules to initialize, separated by a comma.')
# multilingual related
parser.add_argument('--multilingual', default=False, type=strtobool,
help='Prepend target language ID to the source sentence. \
Both source/target language IDs must be prepended in the pre-processing stage.')
parser.add_argument('--replace-sos', default=False, type=strtobool,
help='Replace <sos> in the decoder with a target language ID \
(the first token in the target sequence)')
return parser
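
# A minimal, self-contained sketch of how the strtobool-typed flags above parse
# ('demo_parser' is an illustrative name, not part of the original source).
# Note that distutils.util.strtobool returns an int (0/1), not a bool.
import argparse
from distutils.util import strtobool

demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('--multilingual', default=False, type=strtobool)
demo_args = demo_parser.parse_args(['--multilingual', 'yes'])
assert demo_args.multilingual == 1  # 'y', 'yes', 'true', 'on', '1' -> 1; 'n', ... -> 0
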
parser.add_argument('--criterion', default='acc', type=str,
choices=['loss', 'acc'],
help='Criterion to perform epsilon decay')
parser.add_argument('--threshold', default=1e-4, type=float,
help='Threshold to stop iteration')
parser.add_argument('--epochs', '-e', default=30, type=int,
help='Maximum number of epochs')
parser.add_argument('--early-stop-criterion', default='validation/main/acc', type=str, nargs='?',
help="Value to monitor to trigger an early stopping of the training")
parser.add_argument('--patience', default=3, type=int, nargs='?',
help="Number of epochs to wait without improvement before stopping the training")
parser.add_argument('--grad-clip', default=5, type=float,
help='Gradient norm threshold to clip')
parser.add_argument('--num-save-attention', default=3, type=int,
help='Number of samples of attention to be saved')
parser.add_argument('--grad-noise', type=strtobool, default=False,
help='Whether to inject noise into the gradients during training')
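
# A minimal sketch of what a --grad-noise flag typically enables: annealed
# Gaussian noise added to the gradients before the optimizer step, following
# the sigma^2 = eta / (1 + t)^0.55 schedule of Neelakantan et al. ('model',
# 'eta', and 'iteration' are illustrative; this is not the exact training loop).
import torch

def add_gradient_noise(model, iteration, eta=0.01, scale_factor=0.55):
    """Add annealed Gaussian noise to every gradient of the model."""
    variance = eta / (1 + iteration) ** scale_factor
    for param in model.parameters():
        if param.grad is not None:
            param.grad += torch.randn_like(param.grad) * variance ** 0.5
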
# speech translation related
parser.add_argument('--context-residual', default=False, type=strtobool, nargs='?',
help='Whether to use a context-vector residual connection in the decoder network')
# finetuning related
parser.add_argument('--enc-init', default=None, type=str, nargs='?',
help='Pre-trained ASR model to initialize encoder.')
parser.add_argument('--enc-init-mods', default='enc.enc.',
type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
help='List of encoder modules to initialize, separated by a comma.')
parser.add_argument('--dec-init', default=None, type=str, nargs='?',
help='Pre-trained ASR, MT or LM model to initialize decoder.')
parser.add_argument('--dec-init-mods', default='att.,dec.',
                    type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
help='List of decoder modules to initialize, separated by a comma.')
parser.add_argument('--criterion', default='loss', type=str,
choices=['loss'],
help='Criterion to perform epsilon decay')
parser.add_argument('--threshold', default=1e-4, type=float,
help='Threshold to stop iteration')
parser.add_argument('--epochs', '-e', default=30, type=int,
help='Maximum number of epochs')
parser.add_argument('--early-stop-criterion', default='validation/main/loss', type=str, nargs='?',
help="Value to monitor to trigger an early stopping of the training")
parser.add_argument('--patience', default=3, type=int, nargs='?',
help="Number of epochs to wait without improvement before stopping the training")
parser.add_argument('--grad-clip', default=5, type=float,
help='Gradient norm threshold to clip')
parser.add_argument('--num-save-attention', default=3, type=int,
help='Number of samples of attention to be saved')
parser.add_argument('--grad-noise', type=strtobool, default=False,
help='Whether to inject noise into the gradients during training')
# finetuning related
parser.add_argument('--enc-init', default=None, type=str,
help='Initialize the encoder from a pre-trained ESPnet ASR model.')
parser.add_argument('--enc-init-mods', default='enc.enc.',
type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
help='List of encoder modules to initialize, separated by a comma.')
parser.add_argument('--dec-init', default=None, type=str,
help='Initialize the decoder from a pre-trained ESPnet ASR or LM model.')
parser.add_argument('--dec-init-mods', default='att.,dec.decoder.,dec.att.,dec.embed.',
type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
help='List of decoder modules to initialize, separated by a comma.')
parser.add_argument('--freeze-modules', default='',
type=lambda s: [str(mod) for mod in s.split(',') if mod != ''],
help='List of modules to freeze, separated by a comma.')
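
# A minimal sketch of how module-prefix lists such as --enc-init-mods could be
# applied when loading a pre-trained checkpoint ('filter_state_dict' and
# 'checkpoint.pth' are illustrative names, not the actual finetuning API).
import torch

def filter_state_dict(state_dict, prefixes):
    """Keep only parameters whose names start with one of the given prefixes."""
    return {key: value for key, value in state_dict.items()
            if any(key.startswith(prefix) for prefix in prefixes)}

# pretrained = torch.load('checkpoint.pth', map_location='cpu')
# model.load_state_dict(filter_state_dict(pretrained, ['att.', 'dec.']), strict=False)
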
parser.add_argument('--batch-bins', default=0, type=int,
                    help='Maximum bins in a minibatch (0 to disable)')
parser.add_argument('--batch-frames-in', default=0, type=int,
help='Maximum input frames in a minibatch (0 to disable)')
parser.add_argument('--batch-frames-out', default=0, type=int,
help='Maximum output frames in a minibatch (0 to disable)')
parser.add_argument('--batch-frames-inout', default=0, type=int,
help='Maximum input+output frames in a minibatch (0 to disable)')
parser.add_argument('--maxlen-in', '--batch-seq-maxlen-in', default=100, type=int, metavar='ML',
help='When --batch-count=seq, batch size is reduced if the input sequence length > ML.')
parser.add_argument('--maxlen-out', '--batch-seq-maxlen-out', default=200, type=int, metavar='ML',
help='When --batch-count=seq, batch size is reduced if the output sequence length > ML.')
parser.add_argument('--num-iter-processes', default=0, type=int,
help='Number of worker processes for the iterator')
parser.add_argument('--preprocess-conf', type=str, default=None,
help='The configuration file for the pre-processing')
parser.add_argument('--use-speaker-embedding', default=False, type=strtobool,
help='Whether to use speaker embedding')
parser.add_argument('--use-second-target', default=False, type=strtobool,
help='Whether to use second target')
# optimization related
parser.add_argument('--opt', default='adam', type=str,
choices=['adam', 'noam'],
help='Optimizer')
parser.add_argument('--accum-grad', default=1, type=int,
help='Number of gradient accumulation steps')
parser.add_argument('--lr', default=1e-3, type=float,
help='Learning rate for optimizer')
parser.add_argument('--eps', default=1e-6, type=float,
help='Epsilon for optimizer')
parser.add_argument('--weight-decay', default=1e-6, type=float,
help='Weight decay coefficient for optimizer')
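
# A minimal sketch of the 'noam' learning-rate schedule that the --opt choice
# refers to (the inverse-square-root warmup of "Attention Is All You Need";
# 'model_dim' and 'warmup_steps' are illustrative parameters, not flags above).
def noam_lr(step, model_dim=512, warmup_steps=25000, scale=1.0):
    """LR grows linearly for warmup_steps, then decays as step ** -0.5."""
    step = max(step, 1)
    return scale * model_dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
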
parser.add_argument('--epochs', '-e', default=30, type=int,
                    help='Maximum number of epochs')
group.add_argument("--postnet-layers", default=0, type=int,
                   help="Number of postnet layers")
group.add_argument("--postnet-chans", default=256, type=int,
help="Number of postnet channels")
group.add_argument("--postnet-filts", default=5, type=int,
help="Filter size of postnet")
group.add_argument("--use-batch-norm", default=True, type=strtobool,
help="Whether to use batch normalization")
group.add_argument("--use-scaled-pos-enc", default=True, type=strtobool,
help="Use trainable scaled positional encoding instead of the fixed scale one")
group.add_argument("--encoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before encoder block")
group.add_argument("--decoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before decoder block")
group.add_argument("--encoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in encoder")
group.add_argument("--decoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in decoder")
group.add_argument("--duration-predictor-layers", default=2, type=int,
help="Number of layers in duration predictor")
group.add_argument("--duration-predictor-chans", default=384, type=int,
help="Number of channels in duration predictor")
group.add_argument("--duration-predictor-kernel-size", default=3, type=int,
help="Kernel size in duration predictor")
group.add_argument("--teacher-model", default=None, type=str, nargs="?",
help="Teacher model file path")
group.add_argument("--reduction-factor", default=1, type=int,
help="Reduction factor")
group.add_argument("--spk-embed-dim", default=None, type=int,
help="Number of speaker embedding dimensions")
group.add_argument("--spk-embed-integration-type", type=str, default="add",
choices=["add", "concat"],
help="How to integrate speaker embedding")
group.add_argument("--positionwise-layer-type", default="linear", type=str,
choices=["linear", "conv1d", "conv1d-linear"],
help="Positionwise layer type.")
group.add_argument("--positionwise-conv-kernel-size", default=3, type=int,
help="Kernel size of positionwise conv1d layer")
group.add_argument("--postnet-layers", default=0, type=int,
help="Number of postnet layers")
group.add_argument("--postnet-chans", default=256, type=int,
help="Number of postnet channels")
group.add_argument("--postnet-filts", default=5, type=int,
help="Filter size of postnet")
group.add_argument("--use-batch-norm", default=True, type=strtobool,
help="Whether to use batch normalization")
group.add_argument("--use-scaled-pos-enc", default=True, type=strtobool,
help="Use trainable scaled positional encoding instead of the fixed scale one")
group.add_argument("--encoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before encoder block")
group.add_argument("--decoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before decoder block")
group.add_argument("--encoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in encoder")
group.add_argument("--decoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in decoder")
group.add_argument("--duration-predictor-layers", default=2, type=int,
help="Number of layers in duration predictor")
group.add_argument("--duration-predictor-chans", default=384, type=int,
help="Number of channels in duration predictor")
group.add_argument("--duration-predictor-kernel-size", default=3, type=int,
help="Kernel size in duration predictor")
group.add_argument("--teacher-model", default=None, type=str, nargs="?",
help="Teacher model file path")
group.add_argument("--reduction-factor", default=1, type=int,
def ctc_add_arguments(parser):
"""Add arguments for ctc in multi-encoder setting."""
group = parser.add_argument_group("E2E multi-ctc setting")
group.add_argument('--share-ctc', type=strtobool, default=False,
                   help='Whether to share CTC across multiple encoders '
                        '(multi-encoder ASR mode only).')
group.add_argument('--weights-ctc-train', type=float, action='append',
                   help='CTC weight assigned to each encoder during training.')
group.add_argument('--weights-ctc-dec', type=float, action='append',
                   help='CTC weight assigned to each encoder during decoding.')
return parser
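
# Usage sketch for ctc_add_arguments above (argument values are illustrative):
# action='append' lets the per-encoder CTC weights be given once per encoder.
import argparse

demo = ctc_add_arguments(argparse.ArgumentParser())
demo_args = demo.parse_args(['--share-ctc', 'true',
                             '--weights-ctc-train', '0.5',
                             '--weights-ctc-train', '0.5'])
assert demo_args.weights_ctc_train == [0.5, 0.5]
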
parser.add_argument('--dropout-rate-decoder', default=0.0, type=float,
                    help='Dropout rate for the decoder')
# prediction
group.add_argument('--dec-embed-dim', default=320, type=int,
help='Number of decoder embedding dimensions')
parser.add_argument('--dropout-rate-embed-decoder', default=0.0, type=float,
help='Dropout rate for the decoder embeddings')
# general
group.add_argument('--rnnt_type', default='warp-transducer', type=str,
choices=['warp-transducer'],
help='Type of transducer implementation to calculate loss.')
parser.add_argument('--rnnt-mode', default='rnnt', type=str, choices=['rnnt', 'rnnt-att'],
help='RNN-Transducer mode')
parser.add_argument('--joint-dim', default=320, type=int,
help='Number of dimensions in joint space')
# decoding
parser.add_argument('--score-norm-transducer', type=strtobool, nargs='?',
default=True,
help='Normalize transducer scores by length')
group.add_argument("--dlayers", default=6, type=int,
help="Number of decoder layers")
group.add_argument("--dunits", default=1536, type=int,
help="Number of decoder hidden units")
group.add_argument("--positionwise-layer-type", default="linear", type=str,
choices=["linear", "conv1d", "conv1d-linear"],
help="Positionwise layer type.")
group.add_argument("--positionwise-conv-kernel-size", default=3, type=int,
help="Kernel size of positionwise conv1d layer")
group.add_argument("--postnet-layers", default=0, type=int,
help="Number of postnet layers")
group.add_argument("--postnet-chans", default=256, type=int,
help="Number of postnet channels")
group.add_argument("--postnet-filts", default=5, type=int,
help="Filter size of postnet")
group.add_argument("--use-batch-norm", default=True, type=strtobool,
help="Whether to use batch normalization")
group.add_argument("--use-scaled-pos-enc", default=True, type=strtobool,
help="Use trainable scaled positional encoding instead of the fixed scale one")
group.add_argument("--encoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before encoder block")
group.add_argument("--decoder-normalize-before", default=False, type=strtobool,
help="Whether to apply layer norm before decoder block")
group.add_argument("--encoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in encoder")
group.add_argument("--decoder-concat-after", default=False, type=strtobool,
help="Whether to concatenate attention layer's input and output in decoder")
group.add_argument("--duration-predictor-layers", default=2, type=int,
help="Number of layers in duration predictor")
group.add_argument("--duration-predictor-chans", default=384, type=int,
help="Number of channels in duration predictor")
group.add_argument("--duration-predictor-kernel-size", default=3, type=int,
parser.add_argument('--bprojs', type=int, default=300,
                    help='Number of projection units for the beamformer-related network.')
parser.add_argument('--badim', type=int, default=320,
                    help='Number of attention dimensions for the beamformer-related network.')
parser.add_argument('--bnmask', type=int, default=2,
help='Number of beamforming masks, '
'default is 2 for [speech, noise].')
parser.add_argument('--ref-channel', type=int, default=-1,
help='The reference channel used for beamformer. '
'By default, the channel is estimated by DNN.')
parser.add_argument('--bdropout-rate', type=float, default=0.0,
                    help='Dropout rate for the beamformer-related network.')
# Feature transform: Normalization
parser.add_argument('--stats-file', type=str, default=None,
help='The stats file for the feature normalization')
parser.add_argument('--apply-uttmvn', type=strtobool, default=True,
                    help='Apply utterance-level mean-variance normalization.')
parser.add_argument('--uttmvn-norm-means', type=strtobool, default=True,
                    help='Normalize the means in utterance-level MVN.')
parser.add_argument('--uttmvn-norm-vars', type=strtobool, default=False,
                    help='Normalize the variances in utterance-level MVN.')
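
# A minimal sketch of what utterance-level MVN computes for features of shape
# (frames, dims); numpy is used here for illustration only, the real transform
# is configured by the flags above.
import numpy as np

def uttmvn(feat, norm_means=True, norm_vars=False, eps=1.0e-20):
    """Normalize mean and/or variance over the frames of one utterance."""
    if norm_means:
        feat = feat - feat.mean(axis=0)
    if norm_vars:
        feat = feat / np.sqrt(feat.var(axis=0) + eps)
    return feat
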
# Feature transform: Fbank
parser.add_argument('--fbank-fs', type=int, default=16000,
help='The sampling frequency used for '
'the mel-fbank creation.')
parser.add_argument('--n-mels', type=int, default=80,
help='The number of mel-frequency bins.')
parser.add_argument('--fbank-fmin', type=float, default=0.,
                    help='Minimum frequency (Hz) for the mel-fbank creation.')
parser.add_argument('--fbank-fmax', type=float, default=None,
                    help='Maximum frequency (Hz) for the mel-fbank creation '
                         '(None means half the sampling frequency).')
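
# A minimal sketch of the log-mel filterbank that the fbank flags describe
# (librosa and n_fft=512 are assumptions here; the actual extractor may differ).
import numpy as np
import librosa

def logmelfbank(wav, fs=16000, n_fft=512, n_mels=80, fmin=0.0, fmax=None):
    """Power spectrogram -> mel filterbank -> log, mirroring the flags above."""
    spec = np.abs(librosa.stft(wav, n_fft=n_fft)) ** 2
    mel = librosa.filters.mel(sr=fs, n_fft=n_fft, n_mels=n_mels,
                              fmin=fmin, fmax=fmax if fmax is not None else fs / 2)
    return np.log(np.maximum(mel @ spec, 1e-10))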