Table 4. WER for each hyperparameter setting on WSJ dev93 and eval92

Hyperparameters		Values / WER
MODEL	init	chainer	xavier uniform	xavier normal	kaiming uniform	kaiming normal
	init	42.0/35.1	17.0/12.7	17.3/14.0	17.7/13.1	17.6/13.4
	warmup steps	10,000	20,000	30,000	40,000
	warmup steps	15.6/12.4	16.0/12.8	17.3/12.7	17.3/13.6
	keep nbest model	5	10	15	20
	keep nbest model	17.0/12.8	17.3/12.7	16.9/13.0	16.9/13.4
	ctc weight	0.1	0.2	0.3	0.4
	ctc weight	17.3/13.6	17.0/13.1	17.3/12.7	16.5/13.0
	lsm weight	0.1	0.2	0.3	0.4
	lsm weight	17.3/12.7	17.3/13.2	17.5/12.9	18.0/13.3
	length normalized loss	true	false
	length normalized loss	17.7/14.0	17.3/12.7
ENCODER	attention heads	1	2	4	8
	attention heads	17.6/13.3	16.7/13.0	17.3/12.7	17.6/13.4
	linear units	512	1,024	2,048	4,096
	linear units	18.9/14.8	18.0/14.0	17.3/12.7	16.3/12.3
	num blocks	2	4	6	8	12
	num blocks	24.5/19.7	20.2/16.0	18.5/15.0	17.3/13.5	17.3/12.7
	dropout rate	0.0	0.1	0.2	0.3	0.4
	dropout rate	17.4/14.4	17.3/12.7	17.7/13.4	17.8/13.7	20.4/15.7
	attention dropout rate	0.0	0.1	0.2	0.3	0.4
	attention dropout rate	17.3/12.7	16.5/13.0	15.6/12.8	15.8/12.6	15.9/12.7
	normalized before	true	false
	normalized before	17.3/12.7	14.1
DECODER	attention heads	1	2	4	8
	attention heads	17.3/13.0	17.1/12.9	17.3/12.7	17.6/12.8
	linear units	512	1,024	2,048	4,096
	linear units	17.7/13.5	17.5/13.4	17.2/12.7	16.9/12.9
	num blocks	2	4	6	8	12
	num blocks	19.7/16.0	17.2/13.6	17.3/12.7	16.3/12.5	16.3/12.5
	dropout rate	0.0	0.1	0.2	0.3	0.4
	dropout rate	16.5/13.3	17.3/12.7	16.9/13.8	16.3/13.6	17.5/13.5
	self attention dropout rate	0.0	0.1	0.2	0.3	0.4
	self attention dropout rate	17.3/12.7	16.5/13.8	17.0/13.9	16.6/13.8	16.5/13.5

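WER, word error rate. For each hyperparameter, the first row lists the values tested and the second row lists the corresponding WER, given as dev93/eval92.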