€•Œ%torch.distributed.checkpoint.metadata”ŒMetadata”“”)”}”(Œstate_dict_metadata”}”(Œmodel.freqs_cis”hŒTensorStorageMetadata”“”)”}”(Œ properties”hŒTensorProperties”“”)”(Œtorch”Œ complex64”“”Œtorch.serialization”Œ _get_layout”“”Œ torch.strided”…”R”‰hŒ_MEM_FORMAT_ENCODING”“”K…”R”‰t”bŒsize”Œtorch”ŒSize”“”MK †”…”R”Œchunks”]”hŒChunkStorageMetadata”“”)”}”(Œoffsets”h!KK†”…”R”Œsizes”h!MK †”…”R”ubaubŒmodel.tok_embeddings.weight”h )”}”(h h)”(hŒfloat32”“”h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ"model.layers.0.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.0.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.0.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.0.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.0.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.0.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.0.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.0.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.0.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.1.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.1.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.1.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.1.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.1.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.1.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.1.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.1.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.1.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.2.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.2.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.2.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.2.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.2.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.2.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.2.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.2.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.2.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.3.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.3.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.3.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.3.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.3.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.3.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.3.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.3.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.3.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.4.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.4.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.4.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.4.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.4.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.4.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.4.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.4.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.4.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.5.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.5.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.5.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.5.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.5.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.5.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.5.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.5.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.5.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.6.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.6.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.6.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.6.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.6.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.6.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.6.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.6.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.6.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.7.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.7.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.7.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.7.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.7.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.7.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.7.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.7.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.7.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.8.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.8.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.8.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.8.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.8.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.8.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.8.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.8.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.8.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ"model.layers.9.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.9.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.9.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ"model.layers.9.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ%model.layers.9.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.9.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ%model.layers.9.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ$model.layers.9.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.9.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ#model.layers.10.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.10.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.10.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.10.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ&model.layers.10.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ&model.layers.10.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ&model.layers.10.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.10.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.10.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ#model.layers.11.attention.wq.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.11.attention.wk.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.11.attention.wv.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ#model.layers.11.attention.wo.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ&model.layers.11.feed_forward.w1.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ&model.layers.11.feed_forward.w2.weight”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ&model.layers.11.feed_forward.w3.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ%model.layers.11.attention_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.layers.11.ffn_norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.norm.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒmodel.output.weight”h )”}”(h h)”(h8h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ-optimizer.state.tok_embeddings.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ0optimizer.state.tok_embeddings.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ4optimizer.state.layers.0.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.0.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.0.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.0.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.0.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.0.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.0.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.0.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.0.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.0.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.0.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.0.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.0.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.0.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.0.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.0.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.0.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.0.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.1.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.1.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.1.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.1.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.1.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.1.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.1.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.1.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.1.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.1.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.1.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.1.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.1.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.1.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.1.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.1.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.1.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.1.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.2.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.2.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.2.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.2.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.2.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.2.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.2.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.2.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.2.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.2.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.2.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.2.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.2.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.2.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.2.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.2.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.2.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.2.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.3.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.3.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.3.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.3.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.3.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.3.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.3.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.3.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.3.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.3.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.3.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.3.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.3.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.3.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.3.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.3.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.3.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.3.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.4.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.4.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.4.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.4.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.4.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.4.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.4.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.4.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.4.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.4.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.4.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.4.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.4.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.4.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.4.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.4.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.4.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.4.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.5.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.5.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.5.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.5.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.5.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.5.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.5.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.5.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.5.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.5.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.5.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.5.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.5.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.5.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.5.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.5.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.5.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.5.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.6.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.6.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.6.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.6.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.6.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.6.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.6.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.6.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.6.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.6.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.6.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.6.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.6.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.6.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.6.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.6.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.6.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.6.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.7.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.7.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.7.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.7.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.7.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.7.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.7.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.7.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.7.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.7.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.7.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.7.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.7.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.7.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!•2MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.7.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.7.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.7.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.7.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.8.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.8.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.8.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.8.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.8.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.8.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.8.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.8.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.8.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.8.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.8.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.8.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.8.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.8.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.8.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.8.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.8.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.8.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.9.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.9.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.9.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.9.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.9.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.9.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ4optimizer.state.layers.9.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.9.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ7optimizer.state.layers.9.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.9.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.9.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ:optimizer.state.layers.9.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ7optimizer.state.layers.9.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ:optimizer.state.layers.9.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ6optimizer.state.layers.9.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ9optimizer.state.layers.9.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ0optimizer.state.layers.9.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ3optimizer.state.layers.9.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ5optimizer.state.layers.10.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.10.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.10.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.10.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.10.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.10.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.10.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.10.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.10.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ;optimizer.state.layers.10.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ8optimizer.state.layers.10.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ;optimizer.state.layers.10.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ8optimizer.state.layers.10.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ;optimizer.state.layers.10.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.10.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ:optimizer.state.layers.10.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ1optimizer.state.layers.10.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.10.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ5optimizer.state.layers.11.attention.wq.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.11.attention.wq.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.11.attention.wk.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.11.attention.wk.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.11.attention.wv.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.11.attention.wv.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ5optimizer.state.layers.11.attention.wo.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.11.attention.wo.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM†”…”R”ubeubŒ8optimizer.state.layers.11.feed_forward.w1.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ;optimizer.state.layers.11.feed_forward.w1.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ8optimizer.state.layers.11.feed_forward.w2.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ;optimizer.state.layers.11.feed_forward.w2.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!MM †”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubh()”}”(h+h!MK†”…”R”h/h!MM †”…”R”ubeubŒ8optimizer.state.layers.11.feed_forward.w3.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ;optimizer.state.layers.11.feed_forward.w3.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!MÀK†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M€K†”…”R”h/h!MÀM†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!MÀM†”…”R”ubeubŒ7optimizer.state.layers.11.attention_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ:optimizer.state.layers.11.attention_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ1optimizer.state.layers.11.ffn_norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ4optimizer.state.layers.11.ffn_norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ#optimizer.state.norm.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ&optimizer.state.norm.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M…”…”R”h%]”(h()”}”(h+h!K…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubh()”}”(h+h!M…”…”R”h/h!M…”…”R”ubeubŒ%optimizer.state.output.weight.exp_avg”h )”}”(h h)”(h8h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ(optimizer.state.output.weight.exp_avg_sq”h )”}”(h h)”(h8h‰h‰t”bhh!M}M†”…”R”h%]”(h()”}”(h+h!KK†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M@K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!M€>K†”…”R”h/h!M@M†”…”R”ubh()”}”(h+h!MÀ]K†”…”R”h/h!M@M†”…”R”ubeubŒ*optimizer.state.tok_embeddings.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.0.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.0.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.0.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.1.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.1.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.1.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.2.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.2.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.2.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.3.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.3.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.3.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.4.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.4.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.4.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.5.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.5.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.5.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.6.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.6.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.6.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.7.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.7.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.7.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.8.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.8.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.8.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.9.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.9.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ-optimizer.state.layers.9.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.10.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ5optimizer.state.layers.10.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ.optimizer.state.layers.10.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.11.attention.wv.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ5optimizer.state.layers.11.feed_forward.w2.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ.optimizer.state.layers.11.ffn_norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ/optimizer.param_groups.tok_embeddings.weight.lr”hŒBytesStorageMetadata”“”)”Œ9optimizer.param_groups.tok_embeddings.weight.weight_decay”j’7)”Œ5optimizer.param_groups.tok_embeddings.weight.maximize”j’7)”Œ2optimizer.param_groups.tok_embeddings.weight.fused”j’7)”Œ9optimizer.param_groups.layers.0.attention.wq.weight.betas”j’7)”Œ;optimizer.param_groups.layers.0.attention.wq.weight.amsgrad”j’7)”Œ>optimizer.param_groups.layers.0.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.0.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.0.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.0.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.0.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.0.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.0.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.0.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.0.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.0.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.0.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.0.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.0.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.0.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.0.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.0.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.0.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.0.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.0.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.0.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.1.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.1.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.1.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.1.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.1.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.1.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.1.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.1.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.1.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.1.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.1.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.1.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.1.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.1.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.1.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.1.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.1.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.1.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.1.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.1.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.2.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.2.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.2.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.2.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.2.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.2.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.2.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.2.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.2.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.2.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.2.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.2.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.2.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.2.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.2.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.2.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.2.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.2.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.2.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.2.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.3.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.3.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.3.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.3.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.3.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.3.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.3.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.3.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.3.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.3.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.3.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.3.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.3.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.3.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.3.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.3.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.3.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.3.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.3.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.3.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.4.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.4.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.4.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.4.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.4.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.4.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.4.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.4.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.4.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.4.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.4.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.4.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.4.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.4.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.4.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.4.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.4.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.4.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.4.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.4.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.5.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.5.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.5.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.5.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.5.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.5.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.5.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.5.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.5.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.5.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.5.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.5.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.5.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.5.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.5.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.5.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.5.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.5.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.5.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.5.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.6.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.6.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.6.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.6.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.6.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.6.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.6.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.6.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.6.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.6.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.6.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.6.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.6.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.6.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.6.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.6.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.6.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.6.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.6.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.6.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.7.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.7.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.7.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.7.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.7.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.7.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.7.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.7.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.7.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.7.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.7.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.7.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.7.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.7.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.7.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.7.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.7.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.7.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.7.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.7.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.8.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.8.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.8.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.8.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.8.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.8.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.8.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.8.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.8.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.8.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.8.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.8.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.8.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.8.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.8.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.8.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.8.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.8.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.8.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.8.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.9.attention.wq.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.9.attention.wq.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.9.attention.wk.weight.eps”j’7)”Œ;optimizer.param_groups.layers.9.attention.wk.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.9.attention.wk.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.9.attention.wv.weight.lr”j’7)”Œ@optimizer.param_groups.layers.9.attention.wv.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.9.attention.wo.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.9.attention.wo.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.9.feed_forward.w1.weight.eps”j’7)”Œ>optimizer.param_groups.layers.9.feed_forward.w1.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.9.feed_forward.w1.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.9.feed_forward.w2.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.9.feed_forward.w2.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.9.feed_forward.w2.weight.maximize”j’7)”Œoptimizer.param_groups.layers.9.feed_forward.w3.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.initial_lr”j’7)”Œ9optimizer.param_groups.layers.9.attention_norm.weight.eps”j’7)”Œ=optimizer.param_groups.layers.9.attention_norm.weight.foreach”j’7)”ŒDoptimizer.param_groups.layers.9.attention_norm.weight.differentiable”j’7)”Œ2optimizer.param_groups.layers.9.ffn_norm.weight.lr”j’7)”Œoptimizer.param_groups.layers.10.attention_norm.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.10.attention_norm.weight.differentiable”j’7)”Œ3optimizer.param_groups.layers.10.ffn_norm.weight.lr”j’7)”Œ=optimizer.param_groups.layers.10.ffn_norm.weight.weight_decay”j’7)”Œ9optimizer.param_groups.layers.10.ffn_norm.weight.maximize”j’7)”Œ6optimizer.param_groups.layers.10.ffn_norm.weight.fused”j’7)”Œ:optimizer.param_groups.layers.11.attention.wq.weight.betas”j’7)”Œoptimizer.param_groups.layers.11.attention_norm.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.11.attention_norm.weight.differentiable”j’7)”Œ3optimizer.param_groups.layers.11.ffn_norm.weight.lr”j’7)”Œ=optimizer.param_groups.layers.11.ffn_norm.weight.weight_decay”j’7)”Œ9optimizer.param_groups.layers.11.ffn_norm.weight.maximize”j’7)”Œ6optimizer.param_groups.layers.11.ffn_norm.weight.fused”j’7)”Œ(optimizer.param_groups.norm.weight.betas”j’7)”Œ*optimizer.param_groups.norm.weight.amsgrad”j’7)”Œ-optimizer.param_groups.norm.weight.capturable”j’7)”Œ-optimizer.param_groups.norm.weight.initial_lr”j’7)”Œ(optimizer.param_groups.output.weight.eps”j’7)”Œ,optimizer.param_groups.output.weight.foreach”j’7)”Œ3optimizer.param_groups.output.weight.differentiable”j’7)”Œscheduler.base_lrs”j’7)”Œscheduler._step_count”j’7)”Œtraining_progress.total_tokens”j’7)”Œ1optimizer.state.layers.0.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.0.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.0.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.1.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.1.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.1.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.2.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.2.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.2.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.3.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.3.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.3.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.4.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.4.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.4.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.5.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.5.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.5.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.6.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.6.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.6.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.7.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.7.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.7.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.8.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.8.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.8.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.9.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ1optimizer.state.layers.9.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ4optimizer.state.layers.9.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.10.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.10.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ5optimizer.state.layers.10.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.11.attention.wq.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.state.layers.11.attention.wo.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ5optimizer.state.layers.11.feed_forward.w3.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ optimizer.state.norm.weight.step”h )”}”(h h)”(h8h‰h‰t”bhh!)…”R”h%]”h()”}”(h+h!)…”R”h/h!)…”R”ubaubŒ2optimizer.param_groups.tok_embeddings.weight.betas”j’7)”Œ4optimizer.param_groups.tok_embeddings.weight.amsgrad”j’7)”Œ7optimizer.param_groups.tok_embeddings.weight.capturable”j’7)”Œ7optimizer.param_groups.tok_embeddings.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.0.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.0.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.0.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.0.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.0.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.0.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.0.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.0.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.0.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.0.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.0.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.0.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.0.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.0.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.0.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.0.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.0.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.0.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.0.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.0.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.0.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.0.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.0.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.0.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.0.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.1.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.1.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.1.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.1.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.1.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.1.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.1.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.1.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.1.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.1.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.1.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.1.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.1.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.1.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.1.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.1.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.1.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.1.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.1.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.1.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.1.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.1.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.1.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.1.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.1.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.2.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.2.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.2.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.2.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.2.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.2.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.2.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.2.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.2.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.2.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.2.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.2.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.2.feed_forward.w1.weight.maximize”• j’7)”Œoptimizer.param_groups.layers.2.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.2.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.2.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.2.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.2.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.2.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.2.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.2.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.2.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.2.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.2.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.2.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.3.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.3.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.3.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.3.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.3.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.3.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.3.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.3.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.3.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.3.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.3.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.3.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.3.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.3.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.3.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.3.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.3.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.3.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.3.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.3.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.3.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.3.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.3.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.3.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.3.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.4.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.4.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.4.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.4.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.4.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.4.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.4.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.4.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.4.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.4.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.4.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.4.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.4.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.4.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.4.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.4.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.4.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.4.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.4.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.4.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.4.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.4.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.4.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.4.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.4.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.5.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.5.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.5.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.5.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.5.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.5.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.5.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.5.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.5.attention.wo.weight.foreach”j’7)”u(ŒBoptimizer.param_groups.layers.5.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.5.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.5.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.5.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.5.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.5.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.5.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.5.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.5.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.5.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.5.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.5.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.5.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.5.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.5.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.5.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.6.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.6.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.6.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.6.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.6.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.6.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.6.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.6.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.6.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.6.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.6.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.6.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.6.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.6.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.6.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.6.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.6.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.6.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.6.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.6.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.6.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.6.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.6.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.6.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.6.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.7.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.7.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.7.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.7.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.7.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.7.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.7.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.7.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.7.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.7.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.7.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.7.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.7.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.7.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.7.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.7.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.7.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.7.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.7.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.7.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.7.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.7.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.7.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.7.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.7.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.8.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.8.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.8.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.8.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.8.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.8.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.8.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.8.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.8.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.8.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.8.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.8.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.8.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.8.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.8.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.8.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.8.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.8.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.8.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.8.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.8.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.8.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.8.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.8.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.8.ffn_norm.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.9.attention.wq.weight.eps”j’7)”Œ;optimizer.param_groups.layers.9.attention.wq.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.9.attention.wq.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.9.attention.wk.weight.lr”j’7)”Œ@optimizer.param_groups.layers.9.attention.wk.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.9.attention.wv.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.9.attention.wv.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.9.attention.wo.weight.eps”j’7)”Œ;optimizer.param_groups.layers.9.attention.wo.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.9.attention.wo.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.9.feed_forward.w1.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.9.feed_forward.w1.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.9.feed_forward.w1.weight.maximize”j’7)”Œoptimizer.param_groups.layers.9.feed_forward.w2.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w2.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w2.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.9.feed_forward.w3.weight.eps”j’7)”Œ>optimizer.param_groups.layers.9.feed_forward.w3.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.9.feed_forward.w3.weight.differentiable”j’7)”Œ8optimizer.param_groups.layers.9.attention_norm.weight.lr”j’7)”ŒBoptimizer.param_groups.layers.9.attention_norm.weight.weight_decay”j’7)”Œ>optimizer.param_groups.layers.9.attention_norm.weight.maximize”j’7)”Œ;optimizer.param_groups.layers.9.attention_norm.weight.fused”j’7)”Œ5optimizer.param_groups.layers.9.ffn_norm.weight.betas”j’7)”Œ7optimizer.param_groups.layers.9.ffn_norm.weight.amsgrad”j’7)”Œ:optimizer.param_groups.layers.9.ffn_norm.weight.capturable”j’7)”Œ:optimizer.param_groups.layers.9.ffn_norm.weight.initial_lr”j’7)”Œ8optimizer.param_groups.layers.10.attention.wq.weight.eps”j’7)”Œoptimizer.param_groups.layers.0.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.0.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.0.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.0.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.0.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.0.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.0.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.0.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.0.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.0.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.0.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.0.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.0.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.0.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.0.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.0.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.1.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.1.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.1.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.1.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.1.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.1.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.1.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.1.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.1.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.1.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.1.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.1.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.1.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.1.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.1.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.1.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.1.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.1.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.2.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.2.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.2.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.2.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.2.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.2.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.2.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.2.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.2.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.2.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.2.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.2.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.2.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.2.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.2.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.2.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.2.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.2.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.3.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.3.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.3.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.3.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.3.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.3.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.3.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.3.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.3.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.3.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.3.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.3.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.3.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.3.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.3.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.3.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.3.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.3.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.4.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.4.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.4.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.4.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.4.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.4.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.4.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.4.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.4.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.4.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.4.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.4.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.4.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.4.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.4.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.4.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.4.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.4.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.5.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.5.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.5.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.5.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.5.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.5.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.5.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.5.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.5.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.5.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.5.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.5.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.5.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.5.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.5.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.5.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.5.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.5.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.6.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.6.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.6.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.6.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.6.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.6.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.6.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.6.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.6.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.6.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.6.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.6.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.6.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.6.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.6.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.6.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.6.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.6.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.7.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.7.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.7.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.7.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.7.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.7.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.7.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.7.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.7.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.7.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.7.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.7.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.7.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.7.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.7.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.7.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.7.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.7.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.8.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.8.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.8.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.8.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.8.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.8.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.8.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.8.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.8.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.8.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.8.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.8.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.8.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.8.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.8.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.8.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.8.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.8.ffn_norm.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.9.attention.wq.weight.lr”j’7)”Œ@optimizer.param_groups.layers.9.attention.wq.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.9.attention.wk.weight.capturable”j’7)”Œ>optimizer.param_groups.layers.9.attention.wk.weight.initial_lr”j’7)”Œ7optimizer.param_groups.layers.9.attention.wv.weight.eps”j’7)”Œ;optimizer.param_groups.layers.9.attention.wv.weight.foreach”j’7)”ŒBoptimizer.param_groups.layers.9.attention.wv.weight.differentiable”j’7)”Œ6optimizer.param_groups.layers.9.attention.wo.weight.lr”j’7)”Œ@optimizer.param_groups.layers.9.attention.wo.weight.weight_decay”j’7)”Œoptimizer.param_groups.layers.9.feed_forward.w1.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w1.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.9.feed_forward.w1.weight.initial_lr”j’7)”Œ:optimizer.param_groups.layers.9.feed_forward.w2.weight.eps”j’7)”Œ>optimizer.param_groups.layers.9.feed_forward.w2.weight.foreach”j’7)”ŒEoptimizer.param_groups.layers.9.feed_forward.w2.weight.differentiable”j’7)”Œ9optimizer.param_groups.layers.9.feed_forward.w3.weight.lr”j’7)”ŒCoptimizer.param_groups.layers.9.feed_forward.w3.weight.weight_decay”j’7)”Œ?optimizer.param_groups.layers.9.feed_forward.w3.weight.maximize”j’7)”Œoptimizer.param_groups.layers.9.ffn_norm.weight.differentiable”j’7)”Œ7optimizer.param_groups.layers.10.attention.wq.weight.lr”j’7)”ŒAoptimizer.param_groups.layers.10.attention.wq.weight.weight_decay”j’7)”Œ=optimizer.param_groups.layers.10.attention.wq.weight.maximize”j’7)”Œ:optimizer.param_groups.layers.10.attention.wq.weight.fused”j’7)”Œ:optimizer.param_groups.layers.10.attention.wk.weight.betas”j’7)”Œoptimizer.param_groups.layers.10.attention_norm.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.10.attention_norm.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.10.attention_norm.weight.initial_lr”j’7)”Œ4optimizer.param_groups.layers.10.ffn_norm.weight.eps”j’7)”Œ8optimizer.param_groups.layers.10.ffn_norm.weight.foreach”j’7)”Œ?optimizer.param_groups.layers.10.ffn_norm.weight.differentiable”j’7)”Œ7optimizer.param_groups.layers.11.attention.wq.weight.lr”j’7)”ŒAoptimizer.param_groups.layers.11.attention.wq.weight.weight_decay”j’7)”Œ=optimizer.param_groups.layers.11.attention.wq.weight.maximize”j’7)”Œ:optimizer.param_groups.layers.11.attention.wq.weight.fused”j’7)”Œ:optimizer.param_groups.layers.11.attention.wk.weight.betas”j’7)”Œoptimizer.param_groups.layers.11.attention_norm.weight.amsgrad”j’7)”ŒAoptimizer.param_groups.layers.11.attention_norm.weight.capturable”j’7)”ŒAoptimizer.param_groups.layers.11.attention_norm.weight.initial_lr”j’7)”Œ4optimizer.param_groups.layers.11.ffn_norm.weight.eps”j’7)”Œ8optimizer.param_groups.layers.11.ffn_norm.weight.foreach”j’7)”Œ?optimizer.param_groups.layers.11.ffn_norm.weight.differentiable”j’7)”Œ%optimizer.param_groups.norm.weight.lr”j’7)”Œ/optimizer.param_groups.norm.weight.weight_decay”j’7)”Œ+optimizer.param_groups.norm.weight.maximize”j’7)”Œ(optimizer.param_groups.norm.weight.fused”j’7)”Œ*optimizer.param_groups.output.weight.betas”j’7)”Œ,optimizer.param_groups.output.weight.amsgrad”j’7)”Œ/optimizer.param_groups.output.weight.capturable”j’7)”Œ/optimizer.param_groups.output.weight.initial_lr”j’7)”Œscheduler.verbose”j’7)”Œscheduler._last_lr”j’7)”Œtraining_progress.step”j’7)”uŒ planner_data”}”(Œmodel.freqs_cis”Œmodel”Œ freqs_cis”†”Œmodel.tok_embeddings.weight”j=EŒtok_embeddings.weight”†”Œ"model.layers.0.attention.wq.weight”j=EŒlayers.0.attention.wq.weight”†”Œ"model.layers.0.attention.wk.weight”j=EŒlayers.0.attention.wk.weight”†”Œ"model.layers.0.attention.wv.weight”j=EŒlayers.0.attention.wv.weight”†”Œ"model.layers.0.attention.wo.weight”j=EŒlayers.0.attention.wo.weight”†”Œ%model.layers.0.feed_forward.w1.weight”j=EŒlayers.0.feed_forward.w1.weight”†”Œ%model.layers.0.feed_forward.w2.weight”j=EŒlayers.0.feed_forward.w2.weight”†”Œ%model.layers.0.feed_forward.w3.weight”j=EŒlayers.0.feed_forward.w3.weight”†”Œ$model.layers.0.attention_norm.weight”j=EŒlayers.0.attention_norm.weight”†”Œmodel.layers.0.ffn_norm.weight”j=EŒlayers.0.ffn_norm.weight”†”Œ"model.layers.1.attention.wq.weight”j=EŒlayers.1.attention.wq.weight”†”Œ"model.layers.1.attention.wk.weight”j=EŒlayers.1.attention.wk.weight”†”Œ"model.layers.1.attention.wv.weight”j=EŒlayers.1.attention.wv.weight”†”Œ"model.layers.1.attention.wo.weight”j=EŒlayers.1.attention.wo.weight”†”Œ%model.layers.1.feed_forward.w1.weight”j=EŒlayers.1.feed_forward.w1.weight”†”Œ%model.layers.1.feed_forward.w2.weight”j=EŒlayers.1.feed_forward.w2.weight”†”Œ%model.layers.1.feed_forward.w3.weight”j=EŒlayers.1.feed_forward.w3.weight”†”Œ$model.layers.1.attention_norm.weight”j=EŒlayers.1.attention_norm.weight”†”Œmodel.layers.1.ffn_norm.weight”j=EŒlayers.1.ffn_norm.weight”†”Œ"model.layers.2.attention.wq.weight”j=EŒlayers.2.attention.wq.weight”†”Œ"model.layers.2.attention.wk.weight”j=EŒlayers.2.attention.wk.weight”†”Œ"model.layers.2.attention.wv.weight”j=EŒlayers.2.attention.wv.weight”†”Œ"model.layers.2.attention.wo.weight”j=EŒlayers.2.attention.wo.weight”†”Œ%model.layers.2.feed_forward.w1.weight”j=EŒlayers.2.feed_forward.w1.weight”†”Œ%model.layers.2.feed_forward.w2.weight”j=EŒlayers.2.feed_forward.w2.weight”†”Œ%model.layers.2.feed_forward.w3.weight”j=EŒlayers.2.feed_forward.w3.weight”†”Œ$model.layers.2.attention_norm.weight”j=EŒlayers.2.attention_norm.weight”†”Œmodel.layers.2.ffn_norm.weight”j=EŒlayers.2.ffn_norm.weight”†”Œ"model.layers.3.attention.wq.weight”j=EŒlayers.3.attention.wq.weight”†”Œ"model.layers.3.attention.wk.weight”j=EŒlayers.3.attention.wk.weight”†”Œ"model.layers.3.attention.wv.weight”j=EŒlayers.3.attention.wv.weight”†”Œ"model.layers.3.attention.wo.weight”j=EŒlayers.3.attention.wo.weight”†”Œ%model.layers.3.feed_forward.w1.weight”j=EŒlayers.3.feed_forward.w1.weight”†”Œ%model.layers.3.feed_forward.w2.weight”j=EŒlayers.3.feed_forward.w2.weight”†”Œ%model.layers.3.feed_forward.w3.weight”j=EŒlayers.3.feed_forward.w3.weight”†”Œ$model.layers.3.attention_norm.weight”j=EŒlayers.3.attention_norm.weight”†”Œmodel.layers.3.ffn_norm.weight”j=EŒlayers.3.ffn_norm.weight”†”Œ"model.layers.4.attention.wq.weight”j=EŒlayers.4.attention.wq.weight”†”Œ"model.layers.4.attention.wk.weight”j=EŒlayers.4.attention.wk.weight”†”Œ"model.layers.4.attention.wv.weight”j=EŒlayers.4.attention.wv.weight”†”Œ"model.layers.4.attention.wo.weight”j=EŒlayers.4.attention.wo.weight”†”Œ%model.layers.4.feed_forward.w1.weight”j=EŒlayers.4.feed_forward.w1.weight”†”Œ%model.layers.4.feed_forward.w2.weight”j=EŒlayers.4.feed_forward.w2.weight”†”Œ%model.layers.4.feed_forward.w3.weight”j=EŒlayers.4.feed_forward.w3.weight”†”Œ$model.layers.4.attention_norm.weight”j=EŒlayers.4.attention_norm.weight”†”Œmodel.layers.4.ffn_norm.weight”j=EŒlayers.4.ffn_norm.weight”†”Œ"model.layers.5.attention.wq.weight”j=EŒlayers.5.attention.wq.weight”†”Œ"model.layers.5.attention.wk.weight”j=EŒlayers.5.attention.wk.weight”†”Œ"model.layers.5.attention.wv.weight”j=EŒlayers.5.attention.wv.weight”†”Œ"model.layers.5.attention.wo.weight”j=EŒlayers.5.attention.wo.weight”†”Œ%model.layers.5.feed_forward.w1.weight”j=EŒlayers.5.feed_forward.w1.weight”†”Œ%model.layers.5.feed_forward.w2.weight”j=EŒlayers.5.feed_forward.w2.weight”†”Œ%model.layers.5.feed_forward.w3.weight”j=EŒlayers.5.feed_forward.w3.weight”†”Œ$model.layers.5.attention_norm.weight”j=EŒlayers.5.attention_norm.weight”†”Œmodel.layers.5.ffn_norm.weight”j=EŒlayers.5.ffn_norm.weight”†”Œ"model.layers.6.attention.wq.weight”j=EŒlayers.6.attention.wq.weight”†”Œ"model.layers.6.attention.wk.weight”j=EŒlayers.6.attention.wk.weight”†”Œ"model.layers.6.attention.wv.weight”j=EŒlayers.6.attention.wv.weight”†”Œ"model.layers.6.attention.wo.weight”j=EŒlayers.6.attention.wo.weight”†”Œ%model.layers.6.feed_forward.w1.weight”j=EŒlayers.6.feed_forward.w1.weight”†”Œ%model.layers.6.feed_forward.w2.weight”j=EŒlayers.6.feed_forward.w2.weight”†”Œ%model.layers.6.feed_forward.w3.weight”j=EŒlayers.6.feed_forward.w3.weight”†”Œ$model.layers.6.attention_norm.weight”j=EŒlayers.6.attention_norm.weight”†”Œmodel.layers.6.ffn_norm.weight”j=EŒlayers.6.ffn_norm.weight”†”Œ"model.layers.7.attention.wq.weight”j=EŒlayers.7.attention.wq.weight”†”Œ"model.layers.7.attention.wk.weight”j=EŒlayers.7.attention.wk.weight”†”Œ"model.layers.7.attention.wv.weight”j=EŒlayers.7.attention.wv.weight”†”Œ"model.layers.7.attention.wo.weight”j=EŒlayers.7.attention.wo.weight”†”Œ%model.layers.7.feed_forward.w1.weight”j=EŒlayers.7.feed_forward.w1.weight”†”Œ%model.layers.7.feed_forward.w2.weight”j=EŒlayers.7.feed_forward.w2.weight”†”Œ%model.layers.7.feed_forward.w3.weight”j=EŒlayers.7.feed_forward.w3.weight”†”Œ$model.layers.7.attention_norm.weight”j=EŒlayers.7.attention_norm.weight”†”Œmodel.layers.7.ffn_norm.weight”j=EŒlayers.7.ffn_norm.weight”†”Œ"model.layers.8.attention.wq.weight”j=EŒlayers.8.attention.wq.weight”†”Œ"model.layers.8.attention.wk.weight”j=EŒlayers.8.attention.wk.weight”†”Œ"model.layers.8.attention.wv.weight”j=EŒlayers.8.attention.wv.weight”†”Œ"model.layers.8.attention.wo.weight”j=EŒlayers.8.attention.wo.weight”†”Œ%model.layers.8.feed_forward.w1.weight”j=EŒlayers.8.feed_forward.w1.weight”†”Œ%model.layers.8.feed_forward.w2.weight”j=EŒlayers.8.feed_forward.w2.weight”†”Œ%model.layers.8.feed_forward.w3.weight”j=EŒlayers.8.feed_forward.w3.weight”†”Œ$model.layers.8.attention_norm.weight”j=EŒlayers.8.attention_norm.weight”†”Œmodel.layers.8.ffn_norm.weight”j=EŒlayers.8.ffn_norm.weight”†”Œ"model.layers.9.attention.wq.weight”j=EŒlayers.9.attention.wq.weight”†”Œ"model.layers.9.attention.wk.weight”j=EŒlayers.9.attention.wk.weight”†”Œ"model.layers.9.attention.wv.weight”j=EŒlayers.9.attention.wv.weight”†”Œ"model.layers.9.attention.wo.weight”j=EŒlayers.9.attention.wo.weight”†”Œ%model.layers.9.feed_forward.w1.weight”j=EŒlayers.9.feed_forward.w1.weight”†”Œ%model.layers.9.feed_forward.w2.weight”j=EŒlayers.9.feed_forward.w2.weight”†”Œ%model.layers.9.feed_forward.w3.weight”j=EŒlayers.9.feed_forward.w3.weight”†”Œ$model.layers.9.attention_norm.weight”j=EŒlayers.9.attention_norm.weight”†”Œmodel.layers.9.ffn_norm.weight”j=EŒlayers.9.ffn_norm.weight”†”Œ#model.layers.10.attention.wq.weight”j=EŒlayers.10.attention.wq.weight”†”Œ#model.layers.10.attention.wk.weight”j=EŒlayers.10.attention.wk.weight”†”Œ#model.layers.10.attention.wv.weight”j=EŒlayers.10.attention.wv.weight”†”Œ#model.layers.10.attention.wo.weight”j=EŒlayers.10.attention.wo.weight”†”Œ&model.layers.10.feed_forward.w1.weight”j=EŒ layers.10.feed_forward.w1.weight”†”Œ&model.layers.10.feed_forward.w2.weight”j=EŒ layers.10.feed_forward.w2.weight”†”Œ&model.layers.10.feed_forward.w3.weight”j=EŒ layers.10.feed_forward.w3.weight”†”Œ%model.layers.10.attention_norm.weight”j=EŒlayers.10.attention_norm.weight”†”Œmodel.layers.10.ffn_norm.weight”j=EŒlayers.10.ffn_norm.weight”†”Œ#model.layers.11.attention.wq.weight”j=EŒlayers.11.attention.wq.weight”†”Œ#model.layers.11.attention.wk.weight”j=EŒlayers.11.attention.wk.weight”†”Œ#model.layers.11.attention.wv.weight”j=EŒlayers.11.attention.wv.weight”†”Œ#model.layers.11.attention.wo.weight”j=EŒlayers.11.attention.wo.weight”†”Œ&model.layers.11.feed_forward.w1.weight”j=EŒ layers.11.feed_forward.w1.weight”†”Œ&model.layers.11.feed_forward.w2.weight”j=EŒ layers.11.feed_forward.w2.weight”†”Œ&model.layers.11.feed_forward.w3.weight”j=EŒ layers.11.feed_forward.w3.weight”†”Œ%model.layers.11.attention_norm.weight”j=EŒlayers.11.attention_norm.weight”†”Œmodel.layers.11.ffn_norm.weight”j=EŒlayers.11.ffn_norm.weight”†”Œmodel.norm.weight”j=EŒ norm.weight”†”Œmodel.output.weight”j=EŒ output.weight”†”Œ*optimizer.state.tok_embeddings.weight.step”Œ optimizer”Œ state.tok_embeddings.weight.step”†”Œ-optimizer.state.tok_embeddings.weight.exp_avg”jŽFŒ#state.tok_embeddings.weight.exp_avg”†”Œ0optimizer.state.tok_embeddings.weight.exp_avg_sq”jŽFŒ&state.tok_embeddings.weight.exp_avg_sq”†”Œ1optimizer.state.layers.0.attention.wq.weight.step”jŽFŒ'state.layers.0.attention.wq.weight.step”†”Œ4optimizer.state.layers.0.attention.wq.weight.exp_avg”jŽFŒ*state.layers.0.attention.wq.weight.exp_avg”†”•Œ7optimizer.state.layers.0.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.0.attention.wq.weight.exp_avg_sq”†”j@jŽFŒ'state.layers.0.attention.wk.weight.step”†”Œ4optimizer.state.layers.0.attention.wk.weight.exp_avg”jŽFŒ*state.layers.0.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.0.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.0.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.0.attention.wv.weight.step”jŽFŒ'state.layers.0.attention.wv.weight.step”†”Œ4optimizer.state.layers.0.attention.wv.weight.exp_avg”jŽFŒ*state.layers.0.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.0.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.0.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.0.attention.wo.weight.step”jŽFŒ'state.layers.0.attention.wo.weight.step”†”Œ4optimizer.state.layers.0.attention.wo.weight.exp_avg”jŽFŒ*state.layers.0.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.0.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.0.attention.wo.weight.exp_avg_sq”†”j@jŽFŒ*state.layers.0.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.0.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.0.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.0.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.0.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.0.feed_forward.w2.weight.step”jŽFŒ*state.layers.0.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.0.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.0.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.0.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.0.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.0.feed_forward.w3.weight.step”jŽFŒ*state.layers.0.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.0.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.0.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.0.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.0.feed_forward.w3.weight.exp_avg_sq”†”j@jŽFŒ)state.layers.0.attention_norm.weight.step”†”Œ6optimizer.state.layers.0.attention_norm.weight.exp_avg”jŽFŒ,state.layers.0.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.0.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.0.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.0.ffn_norm.weight.step”jŽFŒ#state.layers.0.ffn_norm.weight.step”†”Œ0optimizer.state.layers.0.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.0.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.0.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.0.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.1.attention.wq.weight.step”jŽFŒ'state.layers.1.attention.wq.weight.step”†”Œ4optimizer.state.layers.1.attention.wq.weight.exp_avg”jŽFŒ*state.layers.1.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.1.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.1.attention.wq.weight.exp_avg_sq”†”j*@jŽFŒ'state.layers.1.attention.wk.weight.step”†”Œ4optimizer.state.layers.1.attention.wk.weight.exp_avg”jŽFŒ*state.layers.1.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.1.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.1.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.1.attention.wv.weight.step”jŽFŒ'state.layers.1.attention.wv.weight.step”†”Œ4optimizer.state.layers.1.attention.wv.weight.exp_avg”jŽFŒ*state.layers.1.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.1.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.1.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.1.attention.wo.weight.step”jŽFŒ'state.layers.1.attention.wo.weight.step”†”Œ4optimizer.state.layers.1.attention.wo.weight.exp_avg”jŽFŒ*state.layers.1.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.1.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.1.attention.wo.weight.exp_avg_sq”†”j8@jŽFŒ*state.layers.1.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.1.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.1.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.1.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.1.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.1.feed_forward.w2.weight.step”jŽFŒ*state.layers.1.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.1.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.1.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.1.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.1.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.1.feed_forward.w3.weight.step”jŽFŒ*state.layers.1.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.1.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.1.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.1.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.1.feed_forward.w3.weight.exp_avg_sq”†”jF@jŽFŒ)state.layers.1.attention_norm.weight.step”†”Œ6optimizer.state.layers.1.attention_norm.weight.exp_avg”jŽFŒ,state.layers.1.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.1.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.1.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.1.ffn_norm.weight.step”jŽFŒ#state.layers.1.ffn_norm.weight.step”†”Œ0optimizer.state.layers.1.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.1.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.1.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.1.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.2.attention.wq.weight.step”jŽFŒ'state.layers.2.attention.wq.weight.step”†”Œ4optimizer.state.layers.2.attention.wq.weight.exp_avg”jŽFŒ*state.layers.2.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.2.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.2.attention.wq.weight.exp_avg_sq”†”jT@jŽFŒ'state.layers.2.attention.wk.weight.step”†”Œ4optimizer.state.layers.2.attention.wk.weight.exp_avg”jŽFŒ*state.layers.2.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.2.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.2.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.2.attention.wv.weight.step”jŽFŒ'state.layers.2.attention.wv.weight.step”†”Œ4optimizer.state.layers.2.attention.wv.weight.exp_avg”jŽFŒ*state.layers.2.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.2.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.2.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.2.attention.wo.weight.step”jŽFŒ'state.layers.2.attention.wo.weight.step”†”Œ4optimizer.state.layers.2.attention.wo.weight.exp_avg”jŽFŒ*state.layers.2.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.2.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.2.attention.wo.weight.exp_avg_sq”†”jb@jŽFŒ*state.layers.2.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.2.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.2.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.2.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.2.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.2.feed_forward.w2.weight.step”jŽFŒ*state.layers.2.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.2.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.2.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.2.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.2.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.2.feed_forward.w3.weight.step”jŽFŒ*state.layers.2.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.2.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.2.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.2.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.2.feed_forward.w3.weight.exp_avg_sq”†”jp@jŽFŒ)state.layers.2.attention_norm.weight.step”†”Œ6optimizer.state.layers.2.attention_norm.weight.exp_avg”jŽFŒ,state.layers.2.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.2.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.2.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.2.ffn_norm.weight.step”jŽFŒ#state.layers.2.ffn_norm.weight.step”†”Œ0optimizer.state.layers.2.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.2.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.2.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.2.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.3.attention.wq.weight.step”jŽFŒ'state.layers.3.attention.wq.weight.step”†”Œ4optimizer.state.layers.3.attention.wq.weight.exp_avg”jŽFŒ*state.layers.3.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.3.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.3.attention.wq.weight.exp_avg_sq”†”j~@jŽFŒ'state.layers.3.attention.wk.weight.step”†”Œ4optimizer.state.layers.3.attention.wk.weight.exp_avg”jŽFŒ*state.layers.3.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.3.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.3.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.3.attention.wv.weight.step”jŽFŒ'state.layers.3.attention.wv.weight.step”†”Œ4optimizer.state.layers.3.attention.wv.weight.exp_avg”jŽFŒ*state.layers.3.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.3.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.3.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.3.attention.wo.weight.step”jŽFŒ'state.layers.3.attention.wo.weight.step”†”Œ4optimizer.state.layers.3.attention.wo.weight.exp_avg”jŽFŒ*state.layers.3.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.3.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.3.attention.wo.weight.exp_avg_sq”†”jŒ@jŽFŒ*state.layers.3.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.3.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.3.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.3.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.3.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.3.feed_forward.w2.weight.step”jŽFŒ*state.layers.3.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.3.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.3.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.3.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.3.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.3.feed_forward.w3.weight.step”jŽFŒ*state.layers.3.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.3.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.3.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.3.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.3.feed_forward.w3.weight.exp_avg_sq”†”jš@jŽFŒ)state.layers.3.attention_norm.weight.step”†”Œ6optimizer.state.layers.3.attention_norm.weight.exp_avg”jŽFŒ,state.layers.3.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.3.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.3.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.3.ffn_norm.weight.step”jŽFŒ#state.layers.3.ffn_norm.weight.step”†”Œ0optimizer.state.layers.3.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.3.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.3.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.3.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.4.attention.wq.weight.step”jŽFŒ'state.layers.4.attention.wq.weight.step”†”Œ4optimizer.state.layers.4.attention.wq.weight.exp_avg”jŽFŒ*state.layers.4.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.4.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.4.attention.wq.weight.exp_avg_sq”†”j¨@jŽFŒ'state.layers.4.attention.wk.weight.step”†”Œ4optimizer.state.layers.4.attention.wk.weight.exp_avg”jŽFŒ*state.layers.4.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.4.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.4.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.4.attention.wv.weight.step”jŽFŒ'state.layers.4.attention.wv.weight.step”†”Œ4optimizer.state.layers.4.attention.wv.weight.exp_avg”jŽFŒ*state.layers.4.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.4.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.4.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.4.attention.wo.weight.step”jŽFŒ'state.layers.4.attention.wo.weight.step”†”Œ4optimizer.state.layers.4.attention.wo.weight.exp_avg”jŽFŒ*state.layers.4.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.4.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.4.attention.wo.weight.exp_avg_sq”†”j¶@jŽFŒ*state.layers.4.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.4.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.4.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.4.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.4.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.4.feed_forward.w2.weight.step”jŽFŒ*state.layers.4.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.4.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.4.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.4.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.4.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.4.feed_forward.w3.weight.step”jŽFŒ*state.layers.4.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.4.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.4.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.4.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.4.feed_forward.w3.weight.exp_avg_sq”†”jÄ@jŽFŒ)state.layers.4.attention_norm.weight.step”†”Œ6optimizer.state.layers.4.attention_norm.weight.exp_avg”jŽFŒ,state.layers.4.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.4.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.4.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.4.ffn_norm.weight.step”jŽFŒ#state.layers.4.ffn_norm.weight.step”†”Œ0optimizer.state.layers.4.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.4.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.4.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.4.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.5.attention.wq.weight.step”jŽFŒ'state.layers.5.attention.wq.weight.step”†”Œ4optimizer.state.layers.5.attention.wq.weight.exp_avg”jŽFŒ*state.layers.5.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.5.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.5.attention.wq.weight.exp_avg_sq”†”jÒ@jŽFŒ'state.layers.5.attention.wk.weight.step”†”Œ4optimizer.state.layers.5.attention.wk.weight.exp_avg”jŽFŒ*state.layers.5.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.5.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.5.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.5.attention.wv.weight.step”jŽFŒ'state.layers.5.attention.wv.weight.step”†”Œ4optimizer.state.layers.5.attention.wv.weight.exp_avg”jŽFŒ*state.layers.5.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.5.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.5.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.5.attention.wo.weight.step”jŽFŒ'state.layers.5.attention.wo.weight.step”†”Œ4optimizer.state.layers.5.attention.wo.weight.exp_avg”jŽFŒ*state.layers.5.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.5.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.5.attention.wo.weight.exp_avg_sq”†”jà@jŽFŒ*state.layers.5.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.5.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.5.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.5.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.5.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.5.feed_forward.w2.weight.step”jŽFŒ*state.layers.5.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.5.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.5.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.5.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.5.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.5.feed_forward.w3.weight.step”jŽFŒ*state.layers.5.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.5.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.5.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.5.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.5.feed_forward.w3.weight.exp_avg_sq”†”jî@jŽFŒ)state.layers.5.attention_norm.weight.step”†”Œ6optimizer.state.layers.5.attention_norm.weight.exp_avg”jŽFŒ,state.layers.5.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.5.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.5.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.5.ffn_norm.weight.step”jŽFŒ#state.layers.5.ffn_norm.weight.step”†”Œ0optimizer.state.layers.5.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.5.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.5.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.5.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.6.attention.wq.weight.step”jŽFŒ'state.layers.6.attention.wq.weight.step”†”Œ4optimizer.state.layers.6.attention.wq.weight.exp_avg”jŽFŒ*state.layers.6.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.6.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.6.attention.wq.weight.exp_avg_sq”†”jü@jŽFŒ'state.layers.6.attention.wk.weight.step”†”Œ4optimizer.state.layers.6.attention.wk.weight.exp_avg”jŽFŒ*state.layers.6.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.6.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.6.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.6.attention.wv.weight.step”jŽFŒ'state.layers.6.attention.wv.weight.step”†”Œ4optimizer.state.layers.6.attention.wv.weight.exp_avg”jŽFŒ*state.layers.6.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.6.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.6.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.6.attention.wo.weight.step”jŽFŒ'state.layers.6.attention.wo.weight.step”†”Œ4optimizer.state.layers.6.attention.wo.weight.exp_avg”jŽFŒ*state.layers.6.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.6.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.6.attention.wo.weight.exp_avg_sq”†”j AjŽFŒ*state.layers.6.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.6.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.6.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.6.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.6.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.6.feed_forward.w2.weight.step”jŽFŒ*state.layers.6.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.6.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.6.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.6.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.6.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.6.feed_forward.w3.weight.step”jŽFŒ*state.layers.6.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.6.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.6.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.6.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.6.feed_forward.w3.weight.exp_avg_sq”†”jAjŽFŒ)state.layers.6.attention_norm.weight.step”†”Œ6optimizer.state.layers.6.attention_norm.weight.exp_avg”jŽFŒ,state.layers.6.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.6.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.6.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.6.ffn_norm.weight.step”jŽFŒ#state.layers.6.ffn_norm.weight.step”†”Œ0optimizer.state.layers.6.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.6.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.6.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.6.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.7.attention.wq.weight.step”jŽFŒ'state.layers.7.attention.wq.weight.step”†”Œ4optimizer.state.layers.7.attention.wq.weight.exp_avg”jŽFŒ*state.layers.7.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.7.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.7.attention.wq.weight.exp_avg_sq”†”j&AjŽFŒ'state.layers.7.attention.wk.weight.step”†”Œ4optimizer.state.layers.7.attention.wk.weight.exp_avg”jŽFŒ*state.layers.7.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.7.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.7.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.7.attention.wv.weight.step”jŽFŒ'state.layers.7.attention.wv.weight.step”†”Œ4optimizer.state.layers.7.attention.wv.weight.exp_avg”jŽFŒ*state.layers.7.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.7.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.7.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.7.attention.wo.weight.step”jŽFŒ'state.layers.7.attention.wo.weight.step”†”Œ4optimizer.state.layers.7.attention.wo.weight.exp_avg”jŽFŒ*state.layers.7.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.7.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.7.attention.wo.weight.exp_avg_sq”†”j4AjŽFŒ*state.layers.7.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.7.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.7.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.7.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.7.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.7.feed_forward.w2.weight.step”jŽFŒ*state.layers.7.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.7.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.7.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.7.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.7.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.7.feed_forward.w3.weight.step”jŽFŒ*state.layers.7.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.7.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.7.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.7.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.7.feed_forward.w3.weight.exp_avg_sq”†”jBAjŽFŒ)state.layers.7.attention_norm.weight.step”†”Œ6optimizer.state.layers.7.attention_norm.weight.exp_avg”jŽFŒ,state.layers.7.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.7.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.7.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.7.ffn_norm.weight.step”jŽFŒ#state.layers.7.ffn_norm.weight.step”†”Œ0optimizer.state.layers.7.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.7.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.7.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.7.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.8.attention.wq.weight.step”jŽFŒ'state.layers.8.attention.wq.weight.step”†”Œ4optimizer.state.layers.8.attention.wq.weight.exp_avg”jŽFŒ*state.layers.8.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.8.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.8.attention.wq.weight.exp_avg_sq”†”jPAjŽFŒ'state.layers.8.attention.wk.weight.step”†”Œ4optimizer.state.layers.8.attention.wk.weight.exp_avg”jŽFŒ*state.layers.8.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.8.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.8.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.8.attention.wv.weight.step”jŽFŒ'state.layers.8.attention.wv.weight.step”†”Œ4optimizer.state.layers.8.attention.wv.weight.exp_avg”jŽFŒ*state.layers.8.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.8.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.8.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.8.attention.wo.weight.step”jŽFŒ'state.layers.8.attention.wo.weight.step”†”Œ4optimizer.state.layers.8.attention.wo.weight.exp_avg”jŽFŒ*state.layers.8.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.8.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.8.attention.wo.weight.exp_avg_sq”†”j^AjŽFŒ*state.layers.8.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.8.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.8.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.8.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.8.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.8.feed_forward.w2.weight.step”jŽFŒ*state.layers.8.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.8.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.8.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.8.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.8.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.8.feed_forward.w3.weight.step”jŽFŒ*state.layers.8.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.8.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.8.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.8.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.8.feed_forward.w3.weight.exp_avg_sq”†”jlAjŽFŒ)state.layers.8.attention_norm.weight.step”†”Œ6optimizer.state.layers.8.attention_norm.weight.exp_avg”jŽFŒ,state.layers.8.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.8.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.8.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.8.ffn_norm.weight.step”jŽFŒ#state.layers.8.ffn_norm.weight.step”†”Œ0optimizer.state.layers.8.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.8.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.8.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.8.ffn_norm.weight.exp_avg_sq”†”Œ1optimizer.state.layers.9.attention.wq.weight.step”jŽFŒ'state.layers.9.attention.wq.weight.step”†”Œ4optimizer.state.layers.9.attention.wq.weight.exp_avg”jŽFŒ*state.layers.9.attention.wq.weight.exp_avg”†”Œ7optimizer.state.layers.9.attention.wq.weight.exp_avg_sq”jŽFŒ-state.layers.9.attention.wq.weight.exp_avg_sq”†”jzAjŽFŒ'state.layers.9.attention.wk.weight.step”†”Œ4optimizer.state.layers.9.attention.wk.weight.exp_avg”jŽFŒ*state.layers.9.attention.wk.weight.exp_avg”†”Œ7optimizer.state.layers.9.attention.wk.weight.exp_avg_sq”jŽFŒ-state.layers.9.attention.wk.weight.exp_avg_sq”†”Œ1optimizer.state.layers.9.attention.wv.weight.step”jŽFŒ'state.layers.9.attention.wv.weight.step”†”Œ4optimizer.state.layers.9.attention.wv.weight.exp_avg”jŽFŒ*state.layers.9.attention.wv.weight.exp_avg”†”Œ7optimizer.state.layers.9.attention.wv.weight.exp_avg_sq”jŽFŒ-state.layers.9.attention.wv.weight.exp_avg_sq”†”Œ1optimizer.state.layers.9.attention.wo.weight.step”jŽFŒ'state.layers.9.attention.wo.weight.step”†”Œ4optimizer.state.layers.9.attention.wo.weight.exp_avg”jŽFŒ*state.layers.9.attention.wo.weight.exp_avg”†”Œ7optimizer.state.layers.9.attention.wo.weight.exp_avg_sq”jŽFŒ-state.layers.9.attention.wo.weight.exp_avg_sq”†”jˆAjŽFŒ*state.layers.9.feed_forward.w1.weight.step”†”Œ7optimizer.state.layers.9.feed_forward.w1.weight.exp_avg”jŽFŒ-state.layers.9.feed_forward.w1.weight.exp_avg”†”Œ:optimizer.state.layers.9.feed_forward.w1.weight.exp_avg_sq”jŽFŒ0state.layers.9.feed_forward.w1.weight.exp_avg_sq”†”Œ4optimizer.state.layers.9.feed_forward.w2.weight.step”jŽFŒ*state.layers.9.feed_forward.w2.weight.step”†”Œ7optimizer.state.layers.9.feed_forward.w2.weight.exp_avg”jŽFŒ-state.layers.9.feed_forward.w2.weight.exp_avg”†”Œ:optimizer.state.layers.9.feed_forward.w2.weight.exp_avg_sq”jŽFŒ0state.layers.9.feed_forward.w2.weight.exp_avg_sq”†”Œ4optimizer.state.layers.9.feed_forward.w3.weight.step”jŽFŒ*state.layers.9.feed_forward.w3.weight.step”†”Œ7optimizer.state.layers.9.feed_forward.w3.weight.exp_avg”jŽFŒ-state.layers.9.feed_forward.w3.weight.exp_avg”†”Œ:optimizer.state.layers.9.feed_forward.w3.weight.exp_avg_sq”jŽFŒ0state.layers.9.feed_forward.w3.weight.exp_avg_sq”†”j–AjŽFŒ)state.layers.9.attention_norm.weight.step”†”Œ6optimizer.state.layers.9.attention_norm.weight.exp_avg”jŽFŒ,state.layers.9.attention_norm.weight.exp_avg”†”Œ9optimizer.state.layers.9.attention_norm.weight.exp_avg_sq”jŽFŒ/state.layers.9.attention_norm.weight.exp_avg_sq”†”Œ-optimizer.state.layers.9.ffn_norm.weight.step”jŽFŒ#state.layers.9.ffn_norm.weight.step”†”Œ0optimizer.state.layers.9.ffn_norm.weight.exp_avg”jŽFŒ&state.layers.9.ffn_norm.weight.exp_avg”†”Œ3optimizer.state.layers.9.ffn_norm.weight.exp_avg_sq”jŽFŒ)state.layers.9.ffn_norm.weight.exp_avg_sq”†”Œ2optimizer.state.layers.10.attention.wq.weight.step”jŽFŒ(state.layers.10.attention.wq.weight.step”†”Œ5optimizer.state.layers.10.attention.wq.weight.exp_avg”jŽFŒ+state.layers.10.attention.wq.weight.exp_avg”†”Œ8optimizer.state.layers.10.attention.wq.weight.exp_avg_sq”jŽFŒ.state.layers.10.attention.wq.weight.exp_avg_sq”†”j¤AjŽFŒ(state.layers.10.attention.wk.weight.step”†”Œ5optimizer.state.layers.10.attention.wk.weight.exp_avg”jŽFŒ+state.layers.10.attention.wk.weight.exp_avg”†”Œ8optimizer.state.layers.10.attention.wk.weight.exp_avg_sq”jŽFŒ.state.layers.10.attention.wk.weight.exp_avg_sq”†”Œ2optimizer.state.layers.10.attention.wv.weight.step”jŽFŒ(state.layers.10.attention.wv.weight.step”†”Œ5optimizer.state.layers.10.attention.wv.weight.exp_avg”jŽFŒ+state.layers.10.attention.wv.weight.exp_avg”†”Œ8optimizer.state.layers.10.attention.wv.weight.exp_avg_sq”jŽFŒ.state.layers.10.attention.wv.weight.exp_avg_sq”†”Œ2optimizer.state.layers.10.attention.wo.weight.step”jŽFŒ(state.layers.10.attention.wo.weight.step”†”Œ5optimizer.state.layers.10.attention.wo.weight.exp_avg”jŽFŒ+state.layers.10.attention.wo.weight.exp_avg”†”Œ8optimizer.state.layers.10.attention.wo.weight.exp_avg_sq”jŽFŒ.state.layers.10.attention.wo.weight.exp_avg_sq”†”j²AjŽFŒ+state.layers.10.feed_forward.w1.weight.step”†”Œ8optimizer.state.layers.10.feed_forward.w1.weight.exp_avg”jŽFŒ.state.layers.10.feed_forward.w1.weight.exp_avg”†”Œ;optimizer.state.layers.10.feed_forward.w1.weight.exp_avg_sq”jŽFŒ1state.layers.10.feed_forward.w1.weight.exp_avg_sq”†”Œ5optimizer.state.layers.10.feed_forward.w2.weight.step”jŽFŒ+state.layers.10.feed_forward.w2.weight.step”†”Œ8optimizer.state.layers.10.feed_forward.w2.weight.exp_avg”jŽFŒ.state.layers.10.feed_forward.w2.weight.exp_avg”†”Œ;optimizer.state.layers.10.feed_forward.w2.weight.exp_avg_sq”jŽFŒ1state.layers.10.feed_forward.w2.weight.exp_avg_sq”†”Œ5optimizer.state.layers.10.feed_forward.w3.weight.step”jŽFŒ+state.layers.10.feed_forward.w3.weight.step”†”Œ8optimizer.state.layers.10.feed_forward.w3.weight.exp_avg”jŽFŒ.state.layers.10.feed_forward.w3.weight.exp_avg”†”Œ;optimizer.state.layers.10.feed_forward.w3.weight.exp_avg_sq”jŽFŒ1state.layers.10.feed_forward.w3.weight.exp_avg_sq”†”jÀAjŽFŒ*state.layers.10.attention_norm.weight.step”†”Œ7optimizer.state.layers.10.attention_norm.weight.exp_avg”jŽFŒ-state.layers.10.attention_norm.weight.exp_avg”†”Œ:optimizer.state.layers.10.attention_norm.weight.exp_avg_sq”jŽFŒ0state.layers.10.attention_norm.weight.exp_avg_sq”†”Œ.optimizer.state.layers.10.ffn_norm.weight.step”jŽFŒ$state.layers.10.ffn_norm.weight.step”†”Œ1optimizer.state.layers.10.ffn_norm.weight.exp_avg”jŽFŒ'state.layers.10.ffn_norm.weight.exp_avg”†”Œ4optimizer.state.layers.10.ffn_norm.weight.exp_avg_sq”jŽFŒ*state.layers.10.ffn_norm.weight.exp_avg_sq”†”Œ2optimizer.state.layers.11.attention.wq.weight.step”jŽFŒ(state.layers.11.attention.wq.weight.step”†”Œ5optimizer.state.layers.11.attention.wq.weight.exp_avg”jŽFŒ+state.layers.11.attention.wq.weight.exp_avg”†”Œ8optimizer.state.layers.11.attention.wq.weight.exp_avg_sq”jŽFŒ.state.layers.11.attention.wq.weight.exp_avg_sq”†”jÎAjŽFŒ(state.layers.11.attention.wk.weight.step”†”Œ5optimizer.state.layers.11.attention.wk.weight.exp_avg”jŽFŒ+state.layers.11.attention.wk.weight.exp_avg”†”Œ8optimizer.state.layers.11.attention.wk.weight.exp_avg_sq”jŽFŒ.state.layers.11.attention.wk.weight.exp_avg_sq”†”Œ2optimizer.state.layers.11.attention.wv.weight.step”jŽFŒ(state.layers.11.attention.wv.weight.step”†”Œ5optimizer.state.layers.11.attention.wv.weight.exp_avg”jŽFŒ+state.layers.11.attention.wv.weight.exp_avg”†”Œ8optimizer.state.layers.11.attention.wv.weight.exp_avg_sq”jŽFŒ.state.layers.11.attention.wv.weight.exp_avg_sq”†”Œ2optimizer.state.layers.11.attention.wo.weight.step”jŽFŒ(state.layers.11.attention.wo.weight.step”†”Œ5optimizer.state.layers.11.attention.wo.weight.exp_avg”jŽFŒ+state.layers.11.attention.wo.weight.exp_avg”†”Œ8optimizer.state.layers.11.attention.wo.weight.exp_avg_sq”jŽFŒ.state.layers.11.attention.wo.weight.exp_avg_sq”†”jÜAjŽFŒ+state.layers.11.feed_forward.w1.weight.step”†”Œ8optimizer.state.layers.11.feed_forward.w1.weight.exp_avg”jŽFŒ.state.layers.11.feed_forward.w1.weight.exp_avg”†”Œ;optimizer.state.layers.11.feed_forward.w1.weight.exp_avg_sq”jŽFŒ1state.layers.11.feed_forward.w1.weight.exp_avg_sq”†”Œ5optimizer.state.layers.11.feed_forward.w2.weight.step”jŽFŒ+state.layers.11.feed_forward.w2.weight.step”†”Œ8optimizer.state.layers.11.feed_forward.w2.weight.exp_avg”jŽFŒ.state.layers.11.feed_forward.w2.weight.exp_avg”†”Œ;optimizer.state.layers.11.feed_forward.w2.weight.exp_avg_sq”jŽFŒ1state.layers.11.feed_forward.w2.weight.exp_avg_sq”†”Œ5optimizer.state.layers.11.feed_forward.w3.weight.step”jŽFŒ+state.layers.11.feed_forward.w3.weight.step”†”Œ8optimizer.state.layers.11.feed_forward.w3.weight.exp_avg”jŽFŒ.state.layers.11.feed_forward.w3.weight.exp_avg”†”Œ;optimizer.state.layers.11.feed_forward.w3.weight.exp_avg_sq”jŽFŒ1state.layers.11.feed_forward.w3.weight.exp_avg_sq”†”jêAjŽFŒ*state.layers.11.attention_norm.weight.step”†”Œ7optimizer.state.layers.11.attention_norm.weight.exp_avg”jŽFŒ-state.layers.11.attention_norm.weight.exp_avg”†”Œ:optimizer.state.layers.11.attention_norm.weight.exp_avg_sq”jŽFŒ0state.layers.11.attention_norm.weight.exp_avg_sq”†”Œ.optimizer.state.layers.11.ffn_norm.weight.step”jŽFŒ$state.layers.11.ffn_norm.weight.step”†”Œ1optimizer.state.layers.11.ffn_norm.weight.exp_avg”jŽFŒ'state.layers.11.ffn_norm.weight.exp_avg”†”Œ4optimizer.state.layers.11.ffn_norm.weight.exp_avg_sq”jŽFŒ*state.layers.11.ffn_norm.weight.exp_avg_sq”†”Œ optimizer.state.norm.weight.step”jŽFŒstate.norm.weight.step”†”Œ#optimizer.state.norm.weight.exp_avg”jŽFŒstate.norm.weight.exp_avg”†”Œ&optimizer.state.norm.weight.exp_avg_sq”jŽFŒstate.norm.weight.exp_avg_sq”†”jøAjŽFŒstate.output.weight.step”†”Œ%optimizer.state.output.weight.exp_avg”jŽFŒstate.output.weight.exp_avg”†”Œ(optimizer.state.output.weight.exp_avg_sq”jŽFŒstate.output.weight.exp_avg_sq”†”Œ/optimizer.param_groups.tok_embeddings.weight.lr”jŽFŒ%param_groups.tok_embeddings.weight.lr”†”Œ2optimizer.param_groups.tok_embeddings.weight.betas”jŽFŒ(param_groups.tok_embeddings.weight.betas”†”jBjŽFŒ¶m_groups.tok_embeddings.weight.eps”†”Œ9optimizer.param_groups.tok_embeddings.weight.weight_decay”jŽFŒ/param_groups.tok_embeddings.weight.weight_decay”†”Œ4optimizer.param_groups.tok_embeddings.weight.amsgrad”jŽFŒ*param_groups.tok_embeddings.weight.amsgrad”†”jBjŽFŒ*param_groups.tok_embeddings.weight.foreach”†”Œ5optimizer.param_groups.tok_embeddings.weight.maximize”jŽFŒ+param_groups.tok_embeddings.weight.maximize”†”Œ7optimizer.param_groups.tok_embeddings.weight.capturable”jŽFŒ-param_groups.tok_embeddings.weight.capturable”†”j BjŽFŒ1param_groups.tok_embeddings.weight.differentiable”†”Œ2optimizer.param_groups.tok_embeddings.weight.fused”jŽFŒ(param_groups.tok_embeddings.weight.fused”†”Œ7optimizer.param_groups.tok_embeddings.weight.initial_lr”jŽFŒ-param_groups.tok_embeddings.weight.initial_lr”†”j BjŽFŒ,param_groups.layers.0.attention.wq.weight.lr”†”Œ9optimizer.param_groups.layers.0.attention.wq.weight.betas”jŽFŒ/param_groups.layers.0.attention.wq.weight.betas”†”Œ7optimizer.param_groups.layers.0.attention.wq.weight.eps”jŽFŒ-param_groups.layers.0.attention.wq.weight.eps”†”jBjŽFŒ6param_groups.layers.0.attention.wq.weight.weight_decay”†”Œ;optimizer.param_groups.layers.0.attention.wq.weight.amsgrad”jŽFŒ1param_groups.layers.0.attention.wq.weight.amsgrad”†”Œ;optimizer.param_groups.layers.0.attention.wq.weight.foreach”jŽFŒ1param_groups.layers.0.attention.wq.weight.foreach”†”jBjŽFŒ2param_groups.layers.0.attention.wq.weight.maximize”†”Œ>optimizer.param_groups.layers.0.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.0.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.0.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.0.attention.wq.weight.differentiable”†”jBjŽFŒ/param_groups.layers.0.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.0.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.0.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.0.attention.wk.weight.lr”jŽFŒ,param_groups.layers.0.attention.wk.weight.lr”†”jBjŽFŒ/param_groups.layers.0.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.0.attention.wk.weight.eps”jŽFŒ-param_groups.layers.0.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.0.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.0.attention.wk.weight.weight_decay”†”jBjŽFŒ1param_groups.layers.0.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.0.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.0.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.0.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.0.attention.wv.weight.capturable”†”j BjŽFŒ8param_groups.layers.0.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.0.attention.wv.weight.fused”jŽFŒ/param_groups.layers.0.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.0.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.0.attention.wv.weight.initial_lr”†”j"BjŽFŒ,param_groups.layers.0.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.0.attention.wo.weight.betas”jŽFŒ/param_groups.layers.0.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.0.attention.wo.weight.eps”jŽFŒ-param_groups.layers.0.attention.wo.weight.eps”†”j$BjŽFŒ6param_groups.layers.0.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.0.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.0.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.0.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.0.attention.wo.weight.foreach”†”j&BjŽFŒ2param_groups.layers.0.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.0.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.0.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.0.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.0.attention.wo.weight.differentiable”†”j(BjŽFŒ/param_groups.layers.0.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.0.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.0.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.0.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.0.feed_forward.w1.weight.lr”†”j*BjŽFŒ2param_groups.layers.0.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.0.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.0.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.0.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.0.feed_forward.w1.weight.weight_decay”†”j,BjŽFŒ4param_groups.layers.0.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.0.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.0.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.0.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.0.feed_forward.w1.weight.maximize”†”j.BjŽFŒ7param_groups.layers.0.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.0.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.0.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.0.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.0.feed_forward.w2.weight.amsgrad”†”j4BjŽFŒ4param_groups.layers.0.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.0.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.0.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.0.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.0.feed_forward.w2.weight.capturable”†”j6BjŽFŒ;param_groups.layers.0.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.0.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.0.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.0.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.0.feed_forward.w3.weight.foreach”†”jBjŽFŒ2param_groups.layers.0.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.0.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.0.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.0.attention_norm.weight.lr”jŽFŒ.param_groups.layers.0.attention_norm.weight.lr”†”j@BjŽFŒ1param_groups.layers.0.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.0.attention_norm.weight.eps”jŽFŒ/param_groups.layers.0.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.0.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.0.attention_norm.weight.weight_decay”†”jBBjŽFŒ3param_groups.layers.0.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.0.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.0.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.0.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.0.attention_norm.weight.maximize”†”jDBjŽFŒ6param_groups.layers.0.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.0.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.0.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.0.attention_norm.weight.fused”jŽFŒ1param_groups.layers.0.attention_norm.weight.fused”†”jFBjŽFŒ6param_groups.layers.0.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.0.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.0.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.0.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.0.ffn_norm.weight.betas”†”jHBjŽFŒ)param_groups.layers.0.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.1.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.1.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.1.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.1.attention.wq.weight.differentiable”†”jTBjŽFŒ/param_groups.layers.1.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.1.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.1.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.1.attention.wk.weight.lr”jŽFŒ,param_groups.layers.1.attention.wk.weight.lr”†”jVBjŽFŒ/param_groups.layers.1.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.1.attention.wk.weight.eps”jŽFŒ-param_groups.layers.1.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.1.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.1.attention.wk.weight.weight_decay”†”jXBjŽFŒ1param_groups.layers.1.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.1.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.1.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.1.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.1.attention.wv.weight.capturable”†”jbBjŽFŒ8param_groups.layers.1.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.1.attention.wv.weight.fused”jŽFŒ/param_groups.layers.1.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.1.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.1.attention.wv.weight.initial_lr”†”jdBjŽFŒ,param_groups.layers.1.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.1.attention.wo.weight.betas”jŽFŒ/param_groups.layers.1.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.1.attention.wo.weight.eps”jŽFŒ-param_groups.layers.1.attention.wo.weight.eps”†”jfBjŽFŒ6param_groups.layers.1.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.1.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.1.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.1.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.1.attention.wo.weight.foreach”†”jhBjŽFŒ2param_groups.layers.1.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.1.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.1.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.1.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.1.attention.wo.weight.differentiable”†”jjBjŽFŒ/param_groups.layers.1.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.1.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.1.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.1.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.1.feed_forward.w1.weight.lr”†”jlBjŽFŒ2param_groups.layers.1.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.1.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.1.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.1.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.1.feed_forward.w1.weight.weight_decay”†”jnBjŽFŒ4param_groups.layers.1.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.1.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.1.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.1.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.1.feed_forward.w1.weight.maximize”†”jpBjŽFŒ7param_groups.layers.1.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.1.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.1.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.1.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.1.feed_forward.w2.weight.amsgrad”†”jvBjŽFŒ4param_groups.layers.1.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.1.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.1.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.1.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.1.feed_forward.w2.weight.capturable”†”jxBjŽFŒ;param_groups.layers.1.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.1.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.1.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.1.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.1.feed_forward.w3.weight.foreach”†”j~BjŽFŒ5param_groups.layers.1.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.1.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.1.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.1.feed_forward.w3.weight.differentiable”†”j€BjŽFŒ2param_groups.layers.1.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.1.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.1.attention_norm.weight.lr”jŽFŒ.param_groups.layers.1.attention_norm.weight.lr”†”j‚BjŽFŒ1param_groups.layers.1.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.1.attention_norm.weight.eps”jŽFŒ/param_groups.layers.1.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.1.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.1.attention_norm.weight.weight_decay”†”j„BjŽFŒ3param_groups.layers.1.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.1.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.1.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.1.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.1.attention_norm.weight.maximize”†”j†BjŽFŒ6param_groups.layers.1.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.1.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.1.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.1.attention_norm.weight.fused”jŽFŒ1param_groups.layers.1.attention_norm.weight.fused”†”jˆBjŽFŒ6param_groups.layers.1.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.1.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.1.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.1.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.1.ffn_norm.weight.betas”†”jŠBjŽFŒ)param_groups.layers.1.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.2.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.2.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.2.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.2.attention.wq.weight.differentiable”†”j–BjŽFŒ/param_groups.layers.2.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.2.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.2.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.2.attention.wk.weight.lr”jŽFŒ,param_groups.layers.2.attention.wk.weight.lr”†”j˜BjŽFŒ/param_groups.layers.2.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.2.attention.wk.weight.eps”jŽFŒ-param_groups.layers.2.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.2.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.2.attention.wk.weight.weight_decay”†”jšBjŽFŒ1param_groups.layers.2.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.2.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.2.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.2.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.2.attention.wv.weight.capturable”†”j¤BjŽFŒ8param_groups.layers.2.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.2.attention.wv.weight.fused”jŽFŒ/param_groups.layers.2.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.2.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.2.attention.wv.weight.initial_lr”†”j¦BjŽFŒ,param_groups.layers.2.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.2.attention.wo.weight.betas”jŽFŒ/param_groups.layers.2.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.2.attention.wo.weight.eps”jŽFŒ-param_groups.layers.2.attention.wo.weight.eps”†”j¨BjŽFŒ6param_groups.layers.2.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.2.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.2.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.2.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.2.attention.wo.weight.foreach”†”jªBjŽFŒ2param_groups.layers.2.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.2.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.2.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.2.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.2.attention.wo.weight.differentiable”†”j¬BjŽFŒ/param_groups.layers.2.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.2.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.2.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.2.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.2.feed_forward.w1.weight.lr”†”j®BjŽFŒ2param_groups.layers.2.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.2.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.2.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.2.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.2.feed_forward.w1.weight.weight_decay”†”j°BjŽFŒ4param_groups.layers.2.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.2.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.2.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.2.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.2.feed_forward.w1.weight.maximize”†”j²BjŽFŒ7param_groups.layers.2.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.2.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.2.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.2.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.2.feed_forward.w2.weight.amsgrad”†”j¸BjŽFŒ4param_groups.layers.2.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.2.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.2.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.2.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.2.feed_forward.w2.weight.capturable”†”jºBjŽFŒ;param_groups.layers.2.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.2.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.2.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.2.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.2.feed_forward.w3.weight.foreach”†”jÀBjŽFŒ5param_groups.layers.2.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.2.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.2.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.2.feed_forward.w3.weight.differentiable”†”jÂBjŽFŒ2param_groups.layers.2.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.2.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.2.attention_norm.weight.lr”jŽFŒ.param_groups.layers.2.attention_norm.weight.lr”†”jÄBjŽFŒ1param_groups.layers.2.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.2.attention_norm.weight.eps”jŽFŒ/param_groups.layers.2.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.2.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.2.attention_norm.weight.weight_decay”†”jÆBjŽFŒ3param_groups.layers.2.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.2.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.2.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.2.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.2.attention_norm.weight.maximize”†”jÈBjŽFŒ6param_groups.layers.2.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.2.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.2.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.2.attention_norm.weight.fused”jŽFŒ1param_groups.layers.2.attention_norm.weight.fused”†”jÊBjŽFŒ6param_groups.layers.2.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.2.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.2.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.2.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.2.ffn_norm.weight.betas”†”jÌBjŽFŒ)param_groups.layers.2.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.3.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.3.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.3.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.3.attention.wq.weight.differentiable”†”jØBjŽFŒ/param_groups.layers.3.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.3.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.3.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.3.attention.wk.weight.lr”jŽFŒ,param_groups.layers.3.attention.wk.weight.lr”†”jÚBjŽFŒ/param_groups.layers.3.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.3.attention.wk.weight.eps”jŽFŒ-param_groups.layers.3.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.3.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.3.attention.wk.weight.weight_decay”†”jÜBjŽFŒ1param_groups.layers.3.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.3.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.3.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.3.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.3.attention.wv.weight.capturable”†”jæBjŽFŒ8param_groups.layers.3.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.3.attention.wv.weight.fused”jŽFŒ/param_groups.layers.3.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.3.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.3.attention.wv.weight.initial_lr”†”jèBjŽFŒ,param_groups.layers.3.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.3.attention.wo.weight.betas”jŽFŒ/param_groups.layers.3.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.3.attention.wo.weight.eps”jŽFŒ-param_groups.layers.3.attention.wo.weight.eps”†”jêBjŽFŒ6param_groups.layers.3.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.3.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.3.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.3.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.3.attention.wo.weight.foreach”†”jìBjŽFŒ2param_groups.layers.3.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.3.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.3.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.3.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.3.attention.wo.weight.differentiable”†”jîBjŽFŒ/param_groups.layers.3.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.3.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.3.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.3.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.3.feed_forward.w1.weight.lr”†”jðBjŽFŒ2param_groups.layers.3.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.3.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.3.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.3.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.3.feed_forward.w1.weight.weight_decay”†”jòBjŽFŒ4param_groups.layers.3.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.3.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.3.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.3.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.3.feed_forward.w1.weight.maximize”†”jôBjŽFŒ7param_groups.layers.3.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.3.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.3.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.3.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.3.feed_forward.w2.weight.amsgrad”†”júBjŽFŒ4param_groups.layers.3.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.3.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.3.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.3.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.3.feed_forward.w2.weight.capturable”†”jüBjŽFŒ;param_groups.layers.3.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.3.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.3.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.3.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.3.feed_forward.w3.weight.foreach”†”jCjŽFŒ5param_groups.layers.3.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.3.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.3.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.3.feed_forward.w3.weight.differentiable”†”jCjŽFŒ2param_groups.layers.3.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.3.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.3.attention_norm.weight.lr”jŽFŒ.param_groups.layers.3.attention_norm.weight.lr”†”jCjŽFŒ1param_groups.layers.3.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.3.attention_norm.weight.eps”jŽFŒ/param_groups.layers.3.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.3.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.3.attention_norm.weight.weight_decay”†”jCjŽFŒ3param_groups.layers.3.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.3.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.3.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.3.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.3.attention_norm.weight.maximize”†”j CjŽFŒ6param_groups.layers.3.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.3.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.3.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.3.attention_norm.weight.fused”jŽFŒ1param_groups.layers.3.attention_norm.weight.fused”†”j CjŽFŒ6param_groups.layers.3.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.3.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.3.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.3.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.3.ffn_norm.weight.betas”†”jCjŽFŒ)param_groups.layers.3.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.4.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.4.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.4.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.4.attention.wq.weight.differentiable”†”jCjŽFŒ/param_groups.layers.4.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.4.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.4.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.4.attention.wk.weight.lr”jŽFŒ,param_groups.layers.4.attention.wk.weight.lr”†”jCjŽFŒ/param_groups.layers.4.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.4.attention.wk.weight.eps”jŽFŒ-param_groups.layers.4.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.4.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.4.attention.wk.weight.weight_decay”†”jCjŽFŒ1param_groups.layers.4.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.4.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.4.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.4.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.4.attention.wv.weight.capturable”†”j(CjŽFŒ8param_groups.layers.4.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.4.attention.wv.weight.fused”jŽFŒ/param_groups.layers.4.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.4.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.4.attention.wv.weight.initial_lr”†”j*CjŽFŒ,param_groups.layers.4.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.4.attention.wo.weight.betas”jŽFŒ/param_groups.layers.4.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.4.attention.wo.weight.eps”jŽFŒ-param_groups.layers.4.attention.wo.weight.eps”†”j,CjŽFŒ6param_groups.layers.4.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.4.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.4.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.4.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.4.attention.wo.weight.foreach”†”j.CjŽFŒ2param_groups.layers.4.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.4.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.4.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.4.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.4.attention.wo.weight.differentiable”†”j0CjŽFŒ/param_groups.layers.4.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.4.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.4.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.4.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.4.feed_forward.w1.weight.lr”†”j2CjŽFŒ2param_groups.layers.4.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.4.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.4.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.4.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.4.feed_forward.w1.weight.weight_decay”†”j4CjŽFŒ4param_groups.layers.4.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.4.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.4.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.4.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.4.feed_forward.w1.weight.maximize”†”j6CjŽFŒ7param_groups.layers.4.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.4.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.4.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.4.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.4.feed_forward.w2.weight.amsgrad”†”jCjŽFŒ;param_groups.layers.4.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.4.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.4.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.4.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.4.feed_forward.w3.weight.foreach”†”jDCjŽFŒ5param_groups.layers.4.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.4.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.4.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.4.feed_forward.w3.weight.differentiable”†”jFCjŽFŒ2param_groups.layers.4.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.4.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.4.attention_norm.weight.lr”jŽFŒ.param_groups.layers.4.attention_norm.weight.lr”†”jHCjŽFŒ1param_groups.layers.4.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.4.attention_norm.weight.eps”jŽFŒ/param_groups.layers.4.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.4.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.4.attention_norm.weight.weight_decay”†”jJCjŽFŒ3param_groups.layers.4.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.4.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.4.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.4.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.4.attention_norm.weight.maximize”†”jLCjŽFŒ6param_groups.layers.4.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.4.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.4.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.4.attention_norm.weight.fused”jŽFŒ1param_groups.layers.4.attention_norm.weight.fused”†”jNCjŽFŒ6param_groups.layers.4.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.4.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.4.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.4.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.4.ffn_norm.weight.betas”†”jPCjŽFŒ)param_groups.layers.4.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.5.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.5.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.5.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.5.attention.wq.weight.differentiable”†”j\CjŽFŒ/param_groups.layers.5.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.5.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.5.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.5.attention.wk.weight.lr”jŽFŒ,param_groups.layers.5.attention.wk.weight.lr”†”j^CjŽFŒ/param_groups.layers.5.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.5.attention.wk.weight.eps”jŽFŒ-param_groups.layers.5.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.5.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.5.attention.wk.weight.weight_decay”†”j`CjŽFŒ1param_groups.layers.5.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.5.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.5.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.5.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.5.attention.wv.weight.capturable”†”jjCjŽFŒ8param_groups.layers.5.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.5.attention.wv.weight.fused”jŽFŒ/param_groups.layers.5.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.5.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.5.attention.wv.weight.initial_lr”†”jlCjŽFŒ,param_groups.layers.5.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.5.attention.wo.weight.betas”jŽFŒ/param_groups.layers.5.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.5.attention.wo.weight.eps”jŽFŒ-param_groups.layers.5.attention.wo.weight.eps”†”jnCjŽFŒ6param_groups.layers.5.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.5.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.5.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.5.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.5.attention.wo.weight.foreach”†”jpCjŽFŒ2param_groups.layers.5.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.5.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.5.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.5.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.5.attention.wo.weight.differentiable”†”jrCjŽFŒ/param_groups.layers.5.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.5.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.5.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.5.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.5.feed_forward.w1.weight.lr”†”jtCjŽFŒ2param_groups.layers.5.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.5.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.5.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.5.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.5.feed_forward.w1.weight.weight_decay”†”jvCjŽFŒ4param_groups.layers.5.feed_forward.w1.weight.amsgrad”†”u(Œ>optimizer.param_groups.layers.5.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.5.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.5.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.5.feed_forward.w1.weight.maximize”†”jxCjŽFŒ7param_groups.layers.5.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.5.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.5.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.5.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.5.feed_forward.w2.weight.amsgrad”†”j~CjŽFŒ4param_groups.layers.5.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.5.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.5.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.5.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.5.feed_forward.w2.weight.capturable”†”j€CjŽFŒ;param_groups.layers.5.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.5.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.5.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.5.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.5.feed_forward.w3.weight.foreach”†”j†CjŽFŒ5param_groups.layers.5.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.5.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.5.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.5.feed_forward.w3.weight.differentiable”†”jˆCjŽFŒ2param_groups.layers.5.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.5.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.5.attention_norm.weight.lr”jŽFŒ.param_groups.layers.5.attention_norm.weight.lr”†”jŠCjŽFŒ1param_groups.layers.5.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.5.attention_norm.weight.eps”jŽFŒ/param_groups.layers.5.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.5.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.5.attention_norm.weight.weight_decay”†”jŒCjŽFŒ3param_groups.layers.5.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.5.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.5.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.5.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.5.attention_norm.weight.maximize”†”jŽCjŽFŒ6param_groups.layers.5.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.5.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.5.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.5.attention_norm.weight.fused”jŽFŒ1param_groups.layers.5.attention_norm.weight.fused”†”jCjŽFŒ6param_groups.layers.5.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.5.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.5.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.5.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.5.ffn_norm.weight.betas”†”j’CjŽFŒ)param_groups.layers.5.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.6.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.6.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.6.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.6.attention.wq.weight.differentiable”†”jžCjŽFŒ/param_groups.layers.6.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.6.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.6.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.6.attention.wk.weight.lr”jŽFŒ,param_groups.layers.6.attention.wk.weight.lr”†”j CjŽFŒ/param_groups.layers.6.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.6.attention.wk.weight.eps”jŽFŒ-param_groups.layers.6.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.6.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.6.attention.wk.weight.weight_decay”†”j¢CjŽFŒ1param_groups.layers.6.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.6.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.6.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.6.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.6.attention.wv.weight.capturable”†”j¬CjŽFŒ8param_groups.layers.6.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.6.attention.wv.weight.fused”jŽFŒ/param_groups.layers.6.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.6.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.6.attention.wv.weight.initial_lr”†”j®CjŽFŒ,param_groups.layers.6.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.6.attention.wo.weight.betas”jŽFŒ/param_groups.layers.6.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.6.attention.wo.weight.eps”jŽFŒ-param_groups.layers.6.attention.wo.weight.eps”†”j°CjŽFŒ6param_groups.layers.6.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.6.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.6.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.6.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.6.attention.wo.weight.foreach”†”j²CjŽFŒ2param_groups.layers.6.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.6.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.6.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.6.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.6.attention.wo.weight.differentiable”†”j´CjŽFŒ/param_groups.layers.6.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.6.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.6.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.6.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.6.feed_forward.w1.weight.lr”†”j¶CjŽFŒ2param_groups.layers.6.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.6.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.6.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.6.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.6.feed_forward.w1.weight.weight_decay”†”j¸CjŽFŒ4param_groups.layers.6.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.6.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.6.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.6.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.6.feed_forward.w1.weight.maximize”†”jºCjŽFŒ7param_groups.layers.6.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.6.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.6.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.6.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.6.feed_forward.w2.weight.amsgrad”†”jÀCjŽFŒ4param_groups.layers.6.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.6.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.6.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.6.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.6.feed_forward.w2.weight.capturable”†”jÂCjŽFŒ;param_groups.layers.6.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.6.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.6.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.6.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.6.feed_forward.w3.weight.foreach”†”jÈCjŽFŒ5param_groups.layers.6.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.6.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.6.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.6.feed_forward.w3.weight.differentiable”†”jÊCjŽFŒ2param_groups.layers.6.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.6.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.6.attention_norm.weight.lr”jŽFŒ.param_groups.layers.6.attention_norm.weight.lr”†”jÌCjŽFŒ1param_groups.layers.6.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.6.attention_norm.weight.eps”jŽFŒ/param_groups.layers.6.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.6.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.6.attention_norm.weight.weight_decay”†”jÎCjŽFŒ3param_groups.layers.6.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.6.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.6.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.6.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.6.attention_norm.weight.maximize”†”jÐCjŽFŒ6param_groups.layers.6.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.6.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.6.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.6.attention_norm.weight.fused”jŽFŒ1param_groups.layers.6.attention_norm.weight.fused”†”jÒCjŽFŒ6param_groups.layers.6.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.6.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.6.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.6.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.6.ffn_norm.weight.betas”†”jÔCjŽFŒ)param_groups.layers.6.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.7.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.7.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.7.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.7.attention.wq.weight.differentiable”†”jàCjŽFŒ/param_groups.layers.7.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.7.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.7.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.7.attention.wk.weight.lr”jŽFŒ,param_groups.layers.7.attention.wk.weight.lr”†”jâCjŽFŒ/param_groups.layers.7.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.7.attention.wk.weight.eps”jŽFŒ-param_groups.layers.7.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.7.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.7.attention.wk.weight.weight_decay”†”jäCjŽFŒ1param_groups.layers.7.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.7.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.7.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.7.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.7.attention.wv.weight.capturable”†”jîCjŽFŒ8param_groups.layers.7.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.7.attention.wv.weight.fused”jŽFŒ/param_groups.layers.7.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.7.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.7.attention.wv.weight.initial_lr”†”jðCjŽFŒ,param_groups.layers.7.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.7.attention.wo.weight.betas”jŽFŒ/param_groups.layers.7.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.7.attention.wo.weight.eps”jŽFŒ-param_groups.layers.7.attention.wo.weight.eps”†”jòCjŽFŒ6param_groups.layers.7.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.7.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.7.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.7.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.7.attention.wo.weight.foreach”†”jôCjŽFŒ2param_groups.layers.7.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.7.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.7.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.7.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.7.attention.wo.weight.differentiable”†”jöCjŽFŒ/param_groups.layers.7.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.7.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.7.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.7.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.7.feed_forward.w1.weight.lr”†”jøCjŽFŒ2param_groups.layers.7.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.7.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.7.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.7.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.7.feed_forward.w1.weight.weight_decay”†”júCjŽFŒ4param_groups.layers.7.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.7.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.7.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.7.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.7.feed_forward.w1.weight.maximize”†”jüCjŽFŒ7param_groups.layers.7.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.7.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.7.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.7.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.7.feed_forward.w2.weight.amsgrad”†”jDjŽFŒ4param_groups.layers.7.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.7.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.7.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.7.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.7.feed_forward.w2.weight.capturable”†”jDjŽFŒ;param_groups.layers.7.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.7.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.7.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.7.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.7.feed_forward.w3.weight.foreach”†”j DjŽFŒ5param_groups.layers.7.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.7.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.7.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.7.feed_forward.w3.weight.differentiable”†”j DjŽFŒ2param_groups.layers.7.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.7.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.7.attention_norm.weight.lr”jŽFŒ.param_groups.layers.7.attention_norm.weight.lr”†”jDjŽFŒ1param_groups.layers.7.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.7.attention_norm.weight.eps”jŽFŒ/param_groups.layers.7.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.7.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.7.attention_norm.weight.weight_decay”†”jDjŽFŒ3param_groups.layers.7.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.7.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.7.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.7.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.7.attention_norm.weight.maximize”†”jDjŽFŒ6param_groups.layers.7.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.7.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.7.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.7.attention_norm.weight.fused”jŽFŒ1param_groups.layers.7.attention_norm.weight.fused”†”jDjŽFŒ6param_groups.layers.7.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.7.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.7.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.7.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.7.ffn_norm.weight.betas”†”jDjŽFŒ)param_groups.layers.7.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.8.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.8.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.8.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.8.attention.wq.weight.differentiable”†”j"DjŽFŒ/param_groups.layers.8.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.8.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.8.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.8.attention.wk.weight.lr”jŽFŒ,param_groups.layers.8.attention.wk.weight.lr”†”j$DjŽFŒ/param_groups.layers.8.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.8.attention.wk.weight.eps”jŽFŒ-param_groups.layers.8.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.8.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.8.attention.wk.weight.weight_decay”†”j&DjŽFŒ1param_groups.layers.8.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.8.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.8.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.8.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.8.attention.wv.weight.capturable”†”j0DjŽFŒ8param_groups.layers.8.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.8.attention.wv.weight.fused”jŽFŒ/param_groups.layers.8.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.8.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.8.attention.wv.weight.initial_lr”†”j2DjŽFŒ,param_groups.layers.8.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.8.attention.wo.weight.betas”jŽFŒ/param_groups.layers.8.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.8.attention.wo.weight.eps”jŽFŒ-param_groups.layers.8.attention.wo.weight.eps”†”j4DjŽFŒ6param_groups.layers.8.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.8.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.8.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.8.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.8.attention.wo.weight.foreach”†”j6DjŽFŒ2param_groups.layers.8.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.8.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.8.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.8.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.8.attention.wo.weight.differentiable”†”j8DjŽFŒ/param_groups.layers.8.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.8.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.8.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.8.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.8.feed_forward.w1.weight.lr”†”j:DjŽFŒ2param_groups.layers.8.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.8.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.8.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.8.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.8.feed_forward.w1.weight.weight_decay”†”joptimizer.param_groups.layers.8.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.8.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.8.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.8.feed_forward.w1.weight.maximize”†”j>DjŽFŒ7param_groups.layers.8.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.8.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.8.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.8.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.8.feed_forward.w2.weight.amsgrad”†”jDDjŽFŒ4param_groups.layers.8.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.8.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.8.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.8.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.8.feed_forward.w2.weight.capturable”†”jFDjŽFŒ;param_groups.layers.8.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.8.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.8.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.8.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.8.feed_forward.w3.weight.foreach”†”jLDjŽFŒ5param_groups.layers.8.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.8.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.8.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.8.feed_forward.w3.weight.differentiable”†”jNDjŽFŒ2param_groups.layers.8.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.8.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.8.attention_norm.weight.lr”jŽFŒ.param_groups.layers.8.attention_norm.weight.lr”†”jPDjŽFŒ1param_groups.layers.8.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.8.attention_norm.weight.eps”jŽFŒ/param_groups.layers.8.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.8.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.8.attention_norm.weight.weight_decay”†”jRDjŽFŒ3param_groups.layers.8.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.8.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.8.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.8.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.8.attention_norm.weight.maximize”†”jTDjŽFŒ6param_groups.layers.8.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.8.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.8.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.8.attention_norm.weight.fused”jŽFŒ1param_groups.layers.8.attention_norm.weight.fused”†”jVDjŽFŒ6param_groups.layers.8.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.8.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.8.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.8.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.8.ffn_norm.weight.betas”†”jXDjŽFŒ)param_groups.layers.8.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.9.attention.wq.weight.capturable”jŽFŒ4param_groups.layers.9.attention.wq.weight.capturable”†”ŒBoptimizer.param_groups.layers.9.attention.wq.weight.differentiable”jŽFŒ8param_groups.layers.9.attention.wq.weight.differentiable”†”jdDjŽFŒ/param_groups.layers.9.attention.wq.weight.fused”†”Œ>optimizer.param_groups.layers.9.attention.wq.weight.initial_lr”jŽFŒ4param_groups.layers.9.attention.wq.weight.initial_lr”†”Œ6optimizer.param_groups.layers.9.attention.wk.weight.lr”jŽFŒ,param_groups.layers.9.attention.wk.weight.lr”†”jfDjŽFŒ/param_groups.layers.9.attention.wk.weight.betas”†”Œ7optimizer.param_groups.layers.9.attention.wk.weight.eps”jŽFŒ-param_groups.layers.9.attention.wk.weight.eps”†”Œ@optimizer.param_groups.layers.9.attention.wk.weight.weight_decay”jŽFŒ6param_groups.layers.9.attention.wk.weight.weight_decay”†”jhDjŽFŒ1param_groups.layers.9.attention.wk.weight.amsgrad”†”Œ;optimizer.param_groups.layers.9.attention.wk.weight.foreach”jŽFŒ1param_groups.layers.9.attention.wk.weight.foreach”†”Œoptimizer.param_groups.layers.9.attention.wv.weight.capturable”jŽFŒ4param_groups.layers.9.attention.wv.weight.capturable”†”jrDjŽFŒ8param_groups.layers.9.attention.wv.weight.differentiable”†”Œ9optimizer.param_groups.layers.9.attention.wv.weight.fused”jŽFŒ/param_groups.layers.9.attention.wv.weight.fused”†”Œ>optimizer.param_groups.layers.9.attention.wv.weight.initial_lr”jŽFŒ4param_groups.layers.9.attention.wv.weight.initial_lr”†”jtDjŽFŒ,param_groups.layers.9.attention.wo.weight.lr”†”Œ9optimizer.param_groups.layers.9.attention.wo.weight.betas”•jŽFŒ/param_groups.layers.9.attention.wo.weight.betas”†”Œ7optimizer.param_groups.layers.9.attention.wo.weight.eps”jŽFŒ-param_groups.layers.9.attention.wo.weight.eps”†”jvDjŽFŒ6param_groups.layers.9.attention.wo.weight.weight_decay”†”Œ;optimizer.param_groups.layers.9.attention.wo.weight.amsgrad”jŽFŒ1param_groups.layers.9.attention.wo.weight.amsgrad”†”Œ;optimizer.param_groups.layers.9.attention.wo.weight.foreach”jŽFŒ1param_groups.layers.9.attention.wo.weight.foreach”†”jxDjŽFŒ2param_groups.layers.9.attention.wo.weight.maximize”†”Œ>optimizer.param_groups.layers.9.attention.wo.weight.capturable”jŽFŒ4param_groups.layers.9.attention.wo.weight.capturable”†”ŒBoptimizer.param_groups.layers.9.attention.wo.weight.differentiable”jŽFŒ8param_groups.layers.9.attention.wo.weight.differentiable”†”jzDjŽFŒ/param_groups.layers.9.attention.wo.weight.fused”†”Œ>optimizer.param_groups.layers.9.attention.wo.weight.initial_lr”jŽFŒ4param_groups.layers.9.attention.wo.weight.initial_lr”†”Œ9optimizer.param_groups.layers.9.feed_forward.w1.weight.lr”jŽFŒ/param_groups.layers.9.feed_forward.w1.weight.lr”†”j|DjŽFŒ2param_groups.layers.9.feed_forward.w1.weight.betas”†”Œ:optimizer.param_groups.layers.9.feed_forward.w1.weight.eps”jŽFŒ0param_groups.layers.9.feed_forward.w1.weight.eps”†”ŒCoptimizer.param_groups.layers.9.feed_forward.w1.weight.weight_decay”jŽFŒ9param_groups.layers.9.feed_forward.w1.weight.weight_decay”†”j~DjŽFŒ4param_groups.layers.9.feed_forward.w1.weight.amsgrad”†”Œ>optimizer.param_groups.layers.9.feed_forward.w1.weight.foreach”jŽFŒ4param_groups.layers.9.feed_forward.w1.weight.foreach”†”Œ?optimizer.param_groups.layers.9.feed_forward.w1.weight.maximize”jŽFŒ5param_groups.layers.9.feed_forward.w1.weight.maximize”†”j€DjŽFŒ7param_groups.layers.9.feed_forward.w1.weight.capturable”†”ŒEoptimizer.param_groups.layers.9.feed_forward.w1.weight.differentiable”jŽFŒ;param_groups.layers.9.feed_forward.w1.weight.differentiable”†”Œoptimizer.param_groups.layers.9.feed_forward.w2.weight.amsgrad”jŽFŒ4param_groups.layers.9.feed_forward.w2.weight.amsgrad”†”j†DjŽFŒ4param_groups.layers.9.feed_forward.w2.weight.foreach”†”Œ?optimizer.param_groups.layers.9.feed_forward.w2.weight.maximize”jŽFŒ5param_groups.layers.9.feed_forward.w2.weight.maximize”†”ŒAoptimizer.param_groups.layers.9.feed_forward.w2.weight.capturable”jŽFŒ7param_groups.layers.9.feed_forward.w2.weight.capturable”†”jˆDjŽFŒ;param_groups.layers.9.feed_forward.w2.weight.differentiable”†”Œoptimizer.param_groups.layers.9.feed_forward.w3.weight.amsgrad”jŽFŒ4param_groups.layers.9.feed_forward.w3.weight.amsgrad”†”Œ>optimizer.param_groups.layers.9.feed_forward.w3.weight.foreach”jŽFŒ4param_groups.layers.9.feed_forward.w3.weight.foreach”†”jŽDjŽFŒ5param_groups.layers.9.feed_forward.w3.weight.maximize”†”ŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.capturable”jŽFŒ7param_groups.layers.9.feed_forward.w3.weight.capturable”†”ŒEoptimizer.param_groups.layers.9.feed_forward.w3.weight.differentiable”jŽFŒ;param_groups.layers.9.feed_forward.w3.weight.differentiable”†”jDjŽFŒ2param_groups.layers.9.feed_forward.w3.weight.fused”†”ŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.initial_lr”jŽFŒ7param_groups.layers.9.feed_forward.w3.weight.initial_lr”†”Œ8optimizer.param_groups.layers.9.attention_norm.weight.lr”jŽFŒ.param_groups.layers.9.attention_norm.weight.lr”†”j’DjŽFŒ1param_groups.layers.9.attention_norm.weight.betas”†”Œ9optimizer.param_groups.layers.9.attention_norm.weight.eps”jŽFŒ/param_groups.layers.9.attention_norm.weight.eps”†”ŒBoptimizer.param_groups.layers.9.attention_norm.weight.weight_decay”jŽFŒ8param_groups.layers.9.attention_norm.weight.weight_decay”†”j”DjŽFŒ3param_groups.layers.9.attention_norm.weight.amsgrad”†”Œ=optimizer.param_groups.layers.9.attention_norm.weight.foreach”jŽFŒ3param_groups.layers.9.attention_norm.weight.foreach”†”Œ>optimizer.param_groups.layers.9.attention_norm.weight.maximize”jŽFŒ4param_groups.layers.9.attention_norm.weight.maximize”†”j–DjŽFŒ6param_groups.layers.9.attention_norm.weight.capturable”†”ŒDoptimizer.param_groups.layers.9.attention_norm.weight.differentiable”jŽFŒ:param_groups.layers.9.attention_norm.weight.differentiable”†”Œ;optimizer.param_groups.layers.9.attention_norm.weight.fused”jŽFŒ1param_groups.layers.9.attention_norm.weight.fused”†”j˜DjŽFŒ6param_groups.layers.9.attention_norm.weight.initial_lr”†”Œ2optimizer.param_groups.layers.9.ffn_norm.weight.lr”jŽFŒ(param_groups.layers.9.ffn_norm.weight.lr”†”Œ5optimizer.param_groups.layers.9.ffn_norm.weight.betas”jŽFŒ+param_groups.layers.9.ffn_norm.weight.betas”†”jšDjŽFŒ)param_groups.layers.9.ffn_norm.weight.eps”†”Œoptimizer.param_groups.layers.10.attention_norm.weight.foreach”jŽFŒ4param_groups.layers.10.attention_norm.weight.foreach”†”Œ?optimizer.param_groups.layers.10.attention_norm.weight.maximize”jŽFŒ5param_groups.layers.10.attention_norm.weight.maximize”†”jØDjŽFŒ7param_groups.layers.10.attention_norm.weight.capturable”†”ŒEoptimizer.param_groups.layers.10.attention_norm.weight.differentiable”jŽFŒ;param_groups.layers.10.attention_norm.weight.differentiable”†”Œoptimizer.param_groups.layers.11.attention_norm.weight.foreach”jŽFŒ4param_groups.layers.11.attention_norm.weight.foreach”†”Œ?optimizer.param_groups.layers.11.attention_norm.weight.maximize”jŽFŒ5param_groups.layers.11.attention_norm.weight.maximize”†”jEjŽFŒ7param_groups.layers.11.attention_norm.weight.capturable”†”ŒEoptimizer.param_groups.layers.11.attention_norm.weight.differentiable”jŽFŒ;param_groups.layers.11.attention_norm.weight.differentiable”†”Œj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.3.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ(Gj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.3.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJÄOj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.3.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ`Xj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.4.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJü`j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.4.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ˜ij6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.4.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ4rj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.4.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJÐzj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.5.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJlƒj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.5.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJŒj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.5.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ¤”j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.5.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ@j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.6.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ¥j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.6.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJx®j6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.6.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ·j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.6.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ°¿j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.7.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJLÈj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.7.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJèÐj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.7.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ„Ùj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.7.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ âj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.8.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ¼êj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.8.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJXój6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.8.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJôûj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.8.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.9.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ, j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.9.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJÈj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.9.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJdj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.9.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ'j6WMœubj%W)”}”(j(WŒ7optimizer.state.layers.10.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJœ/j6WMœubj%W)”}”(j(WŒ:optimizer.state.layers.10.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ88j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.10.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJÔ@j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.10.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJpIj6WMœubj%W)”}”(j(WŒ7optimizer.state.layers.11.attention_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ Rj6WMœubj%W)”}”(j(WŒ:optimizer.state.layers.11.attention_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨Zj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.11.ffn_norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJDcj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.11.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJàkj6WMœubj%W)”}”(j(WŒ#optimizer.state.norm.weight.exp_avg”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ|tj6WMœubj%W)”}”(j(WŒ&optimizer.state.norm.weight.exp_avg_sq”j*WKj+Wh!K…”…”R”ubj1W)”}”(j4Wj5Wj+WJ}j6WMœubj%W)”}”(j(WŒmodel.freqs_cis”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ´…j6WJœubj%W)”}”(j(WŒ"model.layers.0.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJPŠj6WJœubj%W)”}”(j(WŒ"model.layers.0.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJìŽ"j6WJœubj%W)”}”(j(WŒ"model.layers.0.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJˆ“2j6WJœubj%W)”}”(j(WŒ"model.layers.0.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ$˜Bj6WJœubj%W)”}”(j(WŒ"model.layers.1.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÀœRj6WJœubj%W)”}”(j(WŒ"model.layers.1.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ\¡bj6WJœubj%W)”}”(j(WŒ"model.layers.1.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJø¥rj6WJœubj%W)”}”(j(WŒ"model.layers.1.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ”ª‚j6WJœubj%W)”}”(j(WŒ"model.layers.2.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ0¯’j6WJœubj%W)”}”(j(WŒ"model.layers.2.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ̳¢j6WJœubj%W)”}”(j(WŒ"model.layers.2.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJh¸²j6WJœubj%W)”}”(j(WŒ"model.layers.2.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ½Âj6WJœubj%W)”}”(j(WŒ"model.layers.3.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ ÁÒj6WJœubj%W)”}”(j(WŒ"model.layers.3.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ<Æâj6WJœubj%W)”}”(j(WŒ"model.layers.3.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJØÊòj6WJœubj%W)”}”(j(WŒ"model.layers.3.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJtÏj6WJœubj%W)”}”(j(WŒ"model.layers.4.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÔj6WJœubj%W)”}”(j(WŒ"model.layers.4.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¬Ø"j6WJœubj%W)”}”(j(WŒ"model.layers.4.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJHÝ2j6WJœubj%W)”}”(j(WŒ"model.layers.4.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJäáBj6WJœubj%W)”}”(j(WŒ"model.layers.5.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ€æRj6WJœubj%W)”}”(j(WŒ"model.layers.5.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJëbj6WJœubj%W)”}”(j(WŒ"model.layers.5.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¸ïrj6WJœubj%W)”}”(j(WŒ"model.layers.5.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJTô‚j6WJœubj%W)”}”(j(WŒ"model.layers.6.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJðø’j6WJœubj%W)”}”(j(WŒ"model.layers.6.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŒý¢j6WJœubj%W)”}”(j(WŒ"model.layers.6.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ(³j6WJœubj%W)”}”(j(WŒ"model.layers.6.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÄÃj6WJœubj%W)”}”(j(WŒ"model.layers.7.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ` Ój6WJœubj%W)”}”(j(WŒ"model.layers.7.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJüãj6WJœubj%W)”}”(j(WŒ"model.layers.7.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ˜ój6WJœubj%W)”}”(j(WŒ"model.layers.7.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ4j6WJœubj%W)”}”(j(WŒ"model.layers.8.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÐj6WJœubj%W)”}”(j(WŒ"model.layers.8.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJl"#j6WJœubj%W)”}”(j(WŒ"model.layers.8.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ'3j6WJœubj%W)”}”(j(WŒ"model.layers.8.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¤+Cj6WJœubj%W)”}”(j(WŒ"model.layers.9.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ@0Sj6WJœubj%W)”}”(j(WŒ"model.layers.9.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ4cj6WJœubj%W)”}”(j(WŒ"model.layers.9.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJx9sj6WJœubj%W)”}”(j(WŒ"model.layers.9.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ>ƒj6WJœubj%W)”}”(j(WŒ#model.layers.10.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ°B“j6WJœubj%W)”}”(j(WŒ#model.layers.10.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJLG£j6WJœubj%W)”}”(j(WŒ#model.layers.10.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJèK³j6WJœubj%W)”}”(j(WŒ#model.layers.10.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ„PÃj6WJœubj%W)”}”(j(WŒ#model.layers.11.attention.wq.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ UÓj6WJœubj%W)”}”(j(WŒ#model.layers.11.attention.wk.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¼Yãj6WJœubj%W)”}”(j(WŒ#model.layers.11.attention.wv.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJX^ój6WJœubj%W)”}”(j(WŒ#model.layers.11.attention.wo.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJôbj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.0.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJgj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.0.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ,l#j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.0.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÈp3j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.0.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJduCj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.0.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJzSj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.0.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJœ~cj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.0.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ8ƒsj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.0.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÔ‡ƒj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.1.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJpŒ“j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.1.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ ‘£j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.1.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨•³j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.1.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJDšÃj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.1.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJàžÓj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.1.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ|£ãj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.1.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨ój6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.1.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ´¬j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.2.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJP±j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.2.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJìµ#j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.2.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJˆº3j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.2.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ$¿Cj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.2.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÀÃSj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.2.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ\Ècj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.2.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJøÌsj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.2.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ”уj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.3.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ0Ö“j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.3.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÌÚ£j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.3.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJhß³j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.3.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJäÃj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.3.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ èÓj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.3.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ<íãj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.3.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJØñój6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.3.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJtöj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.4.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJûj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.4.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¬ÿ#j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.4.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJH4j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.4.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJäDj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.4.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ€ Tj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.4.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJdj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.4.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¸tj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.4.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJT„j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.5.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJð”j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.5.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŒ$¤j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.5.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ()´j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.5.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÄ-Äj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.5.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ`2Ôj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.5.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJü6äj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.5.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ˜;ôj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.5.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ4@j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.6.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÐDj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.6.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJlI$j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.6.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJN4j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.6.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¤RDj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.6.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ@WTj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.6.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ[dj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.6.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJx`tj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.6.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJe„j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.7.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ°i”j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.7.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJLn¤j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.7.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJèr´j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.7.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ„wÄj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.7.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ |Ôj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.7.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¼€äj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.7.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJX…ôj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.7.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJô‰j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.8.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŽj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.8.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ,“$j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.8.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÈ—4j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.8.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJdœDj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.8.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¡Tj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.8.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJœ¥dj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.8.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ8ªtj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.8.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÔ®„j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.9.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJp³”j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.9.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ ¸¤j6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.9.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨¼´j6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.9.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJDÁÄj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.9.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJàÅÔj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.9.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ|Êäj6WJœubj%W)”}”(j(WŒ4optimizer.state.layers.9.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÏôj6WJœubj%W)”}”(j(WŒ7optimizer.state.layers.9.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ´Ój6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.10.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJPØj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.10.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJìÜ$j6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.10.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJˆá4j6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.10.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ$æDj6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.10.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÀêTj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.10.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ\ïdj6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.10.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJøótj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.10.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ”ø„j6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.11.attention.wq.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ0ý”j6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.11.attention.wq.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÌ¥j6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.11.attention.wk.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJhµj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.11.attention.wk.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ Åj6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.11.attention.wv.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ Õj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.11.attention.wv.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ<åj6WJœubj%W)”}”(j(WŒ5optimizer.state.layers.11.attention.wo.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJØõj6WJœubj%W)”}”(j(WŒ8optimizer.state.layers.11.attention.wo.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJt j6WJœubj%W)”}”(j(WŒ%model.layers.0.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ" j6WJœ,ubj%W)”}”(j(WŒ%model.layers.0.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¬&A j6WJœ,ubj%W)”}”(j(WŒ%model.layers.0.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJH+m j6WJœ,ubj%W)”}”(j(WŒ%model.layers.1.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJä/™ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.1.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ€4Å j6WJœ,ubj%W)”}”(j(WŒ%model.layers.1.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ9ñ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.2.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¸= j6WJœ,ubj%W)”}”(j(WŒ%model.layers.2.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJTBI j6WJœ,ubj%W)”}”(j(WŒ%model.layers.2.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJðFu j6WJœ,ubj%W)”}”(j(WŒ%model.layers.3.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŒK¡ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.3.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ(PÍ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.3.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÄTù j6WJœ,ubj%W)”}”(j(WŒ%model.layers.4.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ`Y% j6WJœ,ubj%W)”}”(j(WŒ%model.layers.4.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJü]Q j6WJœ,ubj%W)”}”(j(WŒ%model.layers.4.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ˜b} j6WJœ,ubj%W)”}”(j(WŒ%model.layers.5.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ4g© j6WJœ,ubj%W)”}”(j(WŒ%model.layers.5.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÐkÕ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.5.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJlp j6WJœ,ubj%W)”}”(j(WŒ%model.layers.6.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJu- j6WJœ,ubj%W)”}”(j(WŒ%model.layers.6.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¤yY j6WJœ,ubj%W)”}”(j(WŒ%model.layers.6.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ@~… j6WJœ,ubj%W)”}”(j(WŒ%model.layers.7.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ‚± j6WJœ,ubj%W)”}”(j(WŒ%model.layers.7.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJx‡Ý j6WJœ,ubj%W)”}”(j(WŒ%model.layers.7.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŒ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.8.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ°5 j6WJœ,ubj%W)”}”(j(WŒ%model.layers.8.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJL•a j6WJœ,ubj%W)”}”(j(WŒ%model.layers.8.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJè™ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.9.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ„ž¹ j6WJœ,ubj%W)”}”(j(WŒ%model.layers.9.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ £å j6WJœ,ubj%W)”}”(j(WŒ%model.layers.9.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¼§j6WJœ,ubj%W)”}”(j(WŒ&model.layers.10.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJX¬=j6WJœ,ubj%W)”}”(j(WŒ&model.layers.10.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJô°ij6WJœ,ubj%W)”}”(j(WŒ&model.layers.10.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJµ•j6WJœ,ubj%W)”}”(j(WŒ&model.layers.11.feed_forward.w1.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ,ºÁj6WJœ,ubj%W)”}”(j(WŒ&model.layers.11.feed_forward.w2.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJȾíj6WJœ,ubj%W)”}”(j(WŒ&model.layers.11.feed_forward.w3.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJdÃj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.0.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÈEj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.0.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJœÌqj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.0.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ8Ñj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.0.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÔÕÉj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.0.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJpÚõj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.0.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ ß!j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.1.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨ãMj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.1.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJDèyj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.1.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJàì¥j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.1.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ|ñÑj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.1.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJöýj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.1.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ´ú)j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.2.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJPÿUj6WJœ,ubj%W)”}”(j(W•Œ:optimizer.state.layers.2.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJì‚j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.2.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJˆ®j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.2.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ$ Új6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.2.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÀj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.2.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ\2j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.3.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJø^j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.3.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ”Šj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.3.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ0$¶j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.3.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÌ(âj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.3.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJh-j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.3.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ2:j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.4.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ 6fj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.4.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ<;’j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.4.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJØ?¾j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.4.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJtDêj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.4.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJIj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.4.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¬MBj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.5.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJHRnj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.5.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJäVšj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.5.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ€[Æj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.5.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ`òj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.5.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¸dj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.5.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJTiJj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.6.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJðmvj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.6.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJŒr¢j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.6.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ(wÎj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.6.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÄ{új6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.6.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ`€&j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.6.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJü„Rj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.7.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ˜‰~j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.7.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ4Žªj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.7.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÐ’Öj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.7.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJl—j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.7.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJœ.j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.7.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¤ Zj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.8.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ@¥†j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.8.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ©²j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.8.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJx®Þj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.8.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ³ j6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.8.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ°·6j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.8.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJL¼bj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.9.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJèÀŽj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.9.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ„źj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.9.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ Êæj6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.9.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¼Îj6WJœ,ubj%W)”}”(j(WŒ7optimizer.state.layers.9.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJXÓ>j6WJœ,ubj%W)”}”(j(WŒ:optimizer.state.layers.9.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJô×jj6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.10.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÜ–j6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.10.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ,áÂj6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.10.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÈåîj6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.10.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJdêj6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.10.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJïFj6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.10.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJœórj6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.11.feed_forward.w1.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ8øžj6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.11.feed_forward.w1.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJÔüÊj6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.11.feed_forward.w2.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJp÷j6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.11.feed_forward.w2.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ #j6WJœ,ubj%W)”}”(j(WŒ8optimizer.state.layers.11.feed_forward.w3.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ¨ Oj6WJœ,ubj%W)”}”(j(WŒ;optimizer.state.layers.11.feed_forward.w3.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJD{j6WJœ,ubj%W)”}”(j(WŒmodel.tok_embeddings.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJà§j6WJœôubj%W)”}”(j(WŒmodel.output.weight”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ|›j6WJœôubj%W)”}”(j(WŒ-optimizer.state.tok_embeddings.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJj6WJœôubj%W)”}”(j(WŒ0optimizer.state.tok_embeddings.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJ´!ƒ!j6WJœôubj%W)”}”(j(WŒ%optimizer.state.output.weight.exp_avg”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJP&w#j6WJœôubj%W)”}”(j(WŒ(optimizer.state.output.weight.exp_avg_sq”j*WKj+Wh!KK†”…”R”ubj1W)”}”(j4Wj5Wj+WJì*k%j6WJœôubj%W)”}”(j(WŒ/optimizer.param_groups.tok_embeddings.weight.lr”j*WNubj1W)”}”(j4WŒ __1_0.distcp”j+WKj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.tok_embeddings.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WM`j6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.tok_embeddings.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WMÀj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.tok_embeddings.weight.fused”j*WNubj1W)”}”(j4Wj¤aj+WM j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.0.attention.wq.weight.betas”j*WNubj1W)”}”(j4Wj¤aj+WM€ j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.0.attention.wq.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WMàj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WM@j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WM j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.0.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WMj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.0.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WM`j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.0.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WMÀ!j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.0.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WM %j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.0.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WM€(j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.0.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WM`9j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WMÀoptimizer.param_groups.layers.0.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WM€Cj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.0.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WMàFj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.0.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WM@Jj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.0.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WM Mj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.0.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WMQj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.0.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WM [j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WM€^j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WMàaj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.0.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WM@ej6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.0.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WM hj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.0.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WMlj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.0.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WM`oj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WM ƒj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WM‡j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.1.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WM`Šj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.1.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WMÀj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.1.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WM ‘j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.1.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WM€”j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.1.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WMà—j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WMÀ¨j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WM ¬j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.1.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WM€¯j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WMà²j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.1.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WM@¶j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.1.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WM ¹j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.1.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WM½j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.1.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WM`Àj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WM€Êj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WMàÍj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WM@Ñj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.1.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WM Ôj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.1.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WMØj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.1.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WM`Ûj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.1.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WMÀÞj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WMój6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WM`öj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.2.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WMÀùj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.2.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WM ýj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.2.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ€j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.2.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJàj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.2.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ@j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.2.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJàj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ@"j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.2.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ %j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.2.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ)j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.2.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ`,j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.2.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJÀ/j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJà9j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ@=j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ @j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.2.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJDj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.2.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ`Gj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.2.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJÀJj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.2.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ Nj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ`bj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀej6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.3.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ ij6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.3.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ€lj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.3.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJàoj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.3.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@sj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.3.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ vj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ€‡j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJàŠj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.3.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ@Žj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ ‘j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.3.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ•j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.3.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ`˜j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.3.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJÀ›j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.3.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ Ÿj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ@©j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ ¬j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ°j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.3.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ`³j6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.3.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJÀ¶j6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.3.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ ºj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.3.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€½j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJÀÑj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ Õj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.4.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ€Øj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.4.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJàÛj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.4.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ@ßj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.4.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ âj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.4.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJæj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJàöj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@új6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.4.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ ýj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.4.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ`j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.4.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.4.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.4.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ€j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ`j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.4.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJÀ"j6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.4.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ &j6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.4.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ€)j6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.4.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJà,j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ Aj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€Dj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.5.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJàGj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.5.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ@Kj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.5.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ Nj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.5.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJRj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.5.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ`Uj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ@fj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ ij6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.5.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJmj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ`pj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.5.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJÀsj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.5.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ wj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.5.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ€zj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.5.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJà}j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJˆj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ`‹j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀŽj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.5.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ ’j6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.5.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ€•j6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.5.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJà˜j6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.5.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@œj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ€°j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJà³j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.6.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ@·j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.6.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ ºj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.6.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ¾j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.6.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ`Áj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.6.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJÀÄj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ Õj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJÙj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.6.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ`Üj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJÀßj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.6.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ ãj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.6.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€æj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.6.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJàéj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.6.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ@íj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ`÷j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJÀúj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ þj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.6.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ€j6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.6.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJàj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.6.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ@j6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.6.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ  j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJàj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@#j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.7.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ &j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ*j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.7.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ`-j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.7.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀ0j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.7.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ 4j6WM`ubj%W)”}”(j(WŒj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention.wo.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ Aj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJEj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ`Hj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.7.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJÀKj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ Oj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.7.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ€Rj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.7.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJàUj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.7.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ@Yj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.7.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ \j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJÀfj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ jj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€mj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.7.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJàpj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.7.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ@tj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.7.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ wj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.7.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ{j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ@j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ ’j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.8.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ–j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.8.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ`™j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.8.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJÀœj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.8.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ  j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.8.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ€£j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ`´j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀ·j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.8.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ »j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ€¾j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.8.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJàÁj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.8.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@Åj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.8.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ Èj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.8.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJÌj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ Öj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ€Ùj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJàÜj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.8.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ@àj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.8.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ ãj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.8.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJçj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.8.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ`êj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.attention.wq.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJ þj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.attention.wq.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.9.attention.wk.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ`j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.9.attention.wk.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJÀj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.9.attention.wk.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.9.attention.wv.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.9.attention.wv.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJàj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.attention.wo.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJÀ#j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.attention.wo.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ 'j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.9.feed_forward.w1.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ€*j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.feed_forward.w1.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJà-j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.9.feed_forward.w1.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ@1j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.9.feed_forward.w2.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ 4j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.9.feed_forward.w2.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ8j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.9.feed_forward.w2.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ`;j6WM`ubj%W)”}”(j(WŒj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.feed_forward.w3.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ€Ej6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJàHj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w3.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ@Lj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.9.attention_norm.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ Oj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.9.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJSj6WM`ubj%W)”}”(j(WŒDoptimizer.param_groups.layers.9.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ`Vj6WM`ubj%W)”}”(j(WŒ2optimizer.param_groups.layers.9.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJÀYj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.10.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJ`Âj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.10.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJÀÅj6WM`ubj%W)”}”(j(WŒ3optimizer.param_groups.layers.10.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ Éj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.10.ffn_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJ€Ìj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.10.ffn_norm.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJàÏj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.10.ffn_norm.weight.fused”j*WNubj1W)”}”(j4Wj¤aj+WJ@Ój6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.11.attention.wq.weight.betas”j*WNubj1W)”}”(j4Wj¤aj+WJ Öj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.11.attention_norm.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJÀ1j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.11.attention_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ 5j6WM`ubj%W)”}”(j(WŒ3optimizer.param_groups.layers.11.ffn_norm.weight.lr”j*WNubj1W)”}”(j4Wj¤aj+WJ€8j6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.11.ffn_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wj¤aj+WJà;j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.11.ffn_norm.weight.maximize”j*WNubj1W)”}”(j4Wj¤aj+WJ@?j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.11.ffn_norm.weight.fused”j*WNubj1W)”}”(j4Wj¤aj+WJ Bj6WM`ubj%W)”}”(j(WŒ(optimizer.param_groups.norm.weight.betas”j*WNubj1W)”}”(j4Wj¤aj+WJFj6WM`ubj%W)”}”(j(WŒ*optimizer.param_groups.norm.weight.amsgrad”j*WNubj1W)”}”(j4Wj¤aj+WJ`Ij6WM`ubj%W)”}”(j(WŒ-optimizer.param_groups.norm.weight.capturable”j*WNubj1W)”}”(j4Wj¤aj+WJÀLj6WM`ubj%W)”}”(j(WŒ-optimizer.param_groups.norm.weight.initial_lr”j*WNubj1W)”}”(j4Wj¤aj+WJ Pj6WM`ubj%W)”}”(j(WŒ(optimizer.param_groups.output.weight.eps”j*WNubj1W)”}”(j4Wj¤aj+WJ€Sj6WM`ubj%W)”}”(j(WŒ,optimizer.param_groups.output.weight.foreach”j*WNubj1W)”}”(j4Wj¤aj+WJàVj6WM`ubj%W)”}”(j(WŒ3optimizer.param_groups.output.weight.differentiable”j*WNubj1W)”}”(j4Wj¤aj+WJ@Zj6WM`ubj%W)”}”(j(WŒscheduler.base_lrs”j*WNubj1W)”}”(j4Wj¤aj+WJ ]j6WM`ubj%W)”}”(j(WŒscheduler._step_count”j*WNubj1W)”}”(j4Wj¤aj+WJaj6WM`ubj%W)”}”(j(WŒtraining_progress.total_tokens”j*WNubj1W)”}”(j4Wj¤aj+WJ`dj6WM`ubj%W)”}”(j(WŒ*optimizer.state.tok_embeddings.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJÀgj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.0.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ\lj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.0.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJøpj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.0.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ”uj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.1.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ0zj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.1.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJÌ~j6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.1.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJhƒj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.2.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJˆj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.2.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ Œj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.2.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ<‘j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.3.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJØ•j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.3.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJtšj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.3.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJŸj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.4.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ¬£j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.4.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJH¨j6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.4.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJä¬j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.5.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ€±j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.5.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ¶j6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.5.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ¸ºj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.6.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJT¿j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.6.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJðÃj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.6.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJŒÈj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.7.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ(Íj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.7.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJÄÑj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.7.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ`Öj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.8.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJüÚj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.8.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ˜ßj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.8.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ4äj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.9.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJÐèj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.9.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJlíj6WMœubj%W)”}”(j(WŒ-optimizer.state.layers.9.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJòj6WMœubj%W)”}”(j(W•Œ2optimizer.state.layers.10.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ¤öj6WMœubj%W)”}”(j(WŒ5optimizer.state.layers.10.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ@ûj6WMœubj%W)”}”(j(WŒ.optimizer.state.layers.10.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJÜÿj6WMœubj%W)”}”(j(WŒ2optimizer.state.layers.11.attention.wv.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJxj6WMœubj%W)”}”(j(WŒ5optimizer.state.layers.11.feed_forward.w2.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ j6WMœubj%W)”}”(j(WŒ.optimizer.state.layers.11.ffn_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj¤aj+WJ° j6WMœubj%W)”}”(j(WŒ$model.layers.0.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJLj6WMœubj%W)”}”(j(WŒmodel.layers.0.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJèj6WMœubj%W)”}”(j(WŒ$model.layers.1.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ„#j6WMœubj%W)”}”(j(WŒmodel.layers.1.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ ,j6WMœubj%W)”}”(j(WŒ$model.layers.2.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ¼4j6WMœubj%W)”}”(j(WŒmodel.layers.2.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJX=j6WMœubj%W)”}”(j(WŒ$model.layers.3.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJôEj6WMœubj%W)”}”(j(WŒmodel.layers.3.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJNj6WMœubj%W)”}”(j(WŒ$model.layers.4.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ,Wj6WMœubj%W)”}”(j(WŒmodel.layers.4.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJÈ_j6WMœubj%W)”}”(j(WŒ$model.layers.5.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJdhj6WMœubj%W)”}”(j(WŒmodel.layers.5.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJqj6WMœubj%W)”}”(j(WŒ$model.layers.6.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJœyj6WMœubj%W)”}”(j(WŒmodel.layers.6.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ8‚j6WMœubj%W)”}”(j(WŒ$model.layers.7.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJÔŠj6WMœubj%W)”}”(j(WŒmodel.layers.7.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJp“j6WMœubj%W)”}”(j(WŒ$model.layers.8.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ œj6WMœubj%W)”}”(j(WŒmodel.layers.8.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ¨¤j6WMœubj%W)”}”(j(WŒ$model.layers.9.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJD­j6WMœubj%W)”}”(j(WŒmodel.layers.9.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJàµj6WMœubj%W)”}”(j(WŒ%model.layers.10.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ|¾j6WMœubj%W)”}”(j(WŒmodel.layers.10.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJÇj6WMœubj%W)”}”(j(WŒ%model.layers.11.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ´Ïj6WMœubj%W)”}”(j(WŒmodel.layers.11.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJPØj6WMœubj%W)”}”(j(WŒmodel.norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJìàj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.0.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJˆéj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.0.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ$òj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.0.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJÀúj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.0.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ\j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.1.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJø j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.1.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ”j6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.1.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ0j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.1.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJÌ%j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.2.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJh.j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.2.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ7j6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.2.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJ ?j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.2.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj¤aj+WJoptimizer.param_groups.layers.0.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WMà+j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WM@/j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.0.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WM 2j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.0.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM6j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.0.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WM`9j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.0.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WMÀoptimizer.param_groups.layers.0.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WM Mj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WMQj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WM`Tj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.0.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WMÀWj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM [j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.0.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WM€^j6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.0.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WMàaj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.0.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WM@ej6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WM hj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.0.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WMlj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.0.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WM`oj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.0.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WMÀrj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.0.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WM vj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.0.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WM€yj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.1.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WMà|j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.1.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM@€j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.1.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WM ƒj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.1.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WM‡j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.1.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WM`Šj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WM@›j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WM žj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.1.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WM¢j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.1.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM`¥j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.1.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WMÀ¨j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.1.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WM ¬j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.1.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WM€¯j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.1.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WMà²j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WM½j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WM`Àj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WMÀÃj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.1.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WM Çj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM€Êj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.1.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WMàÍj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.1.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WM@Ñj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.1.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WM Ôj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WMØj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.1.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WM`Ûj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.1.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WMÀÞj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.1.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WM âj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.1.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WM€åj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.1.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WMàèj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.2.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WM@ìj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.2.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WM ïj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.2.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WMój6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.2.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WM`öj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.2.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WMÀùj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ  j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.2.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ`j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.2.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJÀj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.2.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.2.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ€j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.2.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJàj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.2.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ@"j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ`,j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJÀ/j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ 3j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.2.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ€6j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJà9j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.2.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ@=j6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.2.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ @j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.2.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJDj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ`Gj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.2.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJÀJj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.2.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ Nj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.2.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ€Qj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.2.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJàTj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.2.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ@Xj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.3.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ [j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.3.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ_j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.3.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ`bj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.3.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJÀej6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.3.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ ij6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJzj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ`}j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.3.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJÀ€j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.3.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ „j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.3.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ€‡j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.3.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJàŠj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.3.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ@Žj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.3.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ ‘j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJÀ›j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ Ÿj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ€¢j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.3.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJà¥j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ@©j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.3.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ ¬j6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.3.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ°j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.3.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ`³j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJÀ¶j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.3.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJ ºj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.3.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ€½j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.3.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJàÀj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.3.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ@Äj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.3.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ Çj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.4.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJËj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.4.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ`Îj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.4.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJÀÑj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.4.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ Õj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.4.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ€Øj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ`éj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJÀìj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.4.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ ðj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.4.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ€ój6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.4.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJàöj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.4.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ@új6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.4.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ ýj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.4.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ€j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJàj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.4.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ@j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.4.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.4.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ`j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.4.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJÀ"j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ &j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.4.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJ€)j6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.4.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJà,j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.4.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ@0j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.4.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ 3j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.4.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ7j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.5.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ`:j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.5.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJÀ=j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.5.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ Aj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.5.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ€Dj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.5.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJàGj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJÀXj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ \j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.5.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ€_j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.5.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJàbj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.5.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ@fj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.5.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ ij6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.5.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJmj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.5.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ`pj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ€zj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJà}j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ@j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.5.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ „j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJˆj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.5.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ`‹j6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.5.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJÀŽj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.5.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ ’j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ€•j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.5.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJà˜j6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.5.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ@œj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.5.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ Ÿj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.5.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ£j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.5.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ`¦j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.6.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJÀ©j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.6.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ ­j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.6.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ€°j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.6.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJà³j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.6.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ@·j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ Èj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ€Ëj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.6.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJàÎj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.6.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ@Òj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.6.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ Õj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.6.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJÙj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.6.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ`Üj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.6.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJÀßj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJàéj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ@íj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ ðj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.6.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJôj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ`÷j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.6.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJÀúj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.6.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ þj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.6.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ€j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJàj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.6.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJ@j6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.6.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ  j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.6.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.6.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ`j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.6.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJÀj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.7.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ€j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.7.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJàj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.7.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ@#j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.7.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ &j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ€7j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJà:j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.7.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ@>j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ Aj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.7.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJEj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.7.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ`Hj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.7.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJÀKj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.7.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ Oj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ@Yj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ \j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ`j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.7.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ`cj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJÀfj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.7.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ jj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.7.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ€mj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.7.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJàpj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ@tj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJ wj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.7.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ{j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.7.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ`~j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.7.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJÀj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.7.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ …j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.8.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ€ˆj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.8.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJà‹j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.8.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ@j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.8.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ ’j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.8.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ–j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJà¦j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ@ªj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.8.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ ­j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.8.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ±j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.8.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ`´j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.8.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJÀ·j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.8.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ »j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.8.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ€¾j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ Èj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJÌj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ`Ïj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.8.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJÀÒj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ Öj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.8.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ€Ùj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.8.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJàÜj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.8.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ@àj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJ ãj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.8.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJçj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.8.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJ`êj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.8.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJÀíj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.8.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ ñj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.8.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ€ôj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.9.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJà÷j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.9.attention.wq.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ@ûj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.9.attention.wq.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJ þj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.9.attention.wk.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.9.attention.wk.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ`j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.attention.wv.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ@j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.attention.wv.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJ j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.9.attention.wo.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.9.attention.wo.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ` j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.9.attention.wo.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJÀ#j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.9.feed_forward.w1.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ 'j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.9.feed_forward.w1.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ€*j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.9.feed_forward.w1.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJà-j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.feed_forward.w2.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ8j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w2.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ`;j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w2.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJÀ>j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.9.feed_forward.w3.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ Bj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.feed_forward.w3.weight.foreach”j*WNubj1W)”}”(j4Wjuj+WJ€Ej6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.9.feed_forward.w3.weight.differentiable”j*WNubj1W)”}”(j4Wjuj+WJàHj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.9.attention_norm.weight.lr”j*WNubj1W)”}”(j4Wjuj+WJ@Lj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.9.attention_norm.weight.weight_decay”j*WNubj1W)”}”(j4Wjuj+WJ Oj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.attention_norm.weight.maximize”j*WNubj1W)”}”(j4Wjuj+WJSj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.9.attention_norm.weight.fused”j*WNubj1W)”}”(j4Wjuj+WJ`Vj6WM`ubj%W)”}”(j(WŒ5optimizer.param_groups.layers.9.ffn_norm.weight.betas”j*WNubj1W)”}”(j4Wjuj+WJÀYj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.9.ffn_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wjuj+WJ ]j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.9.ffn_norm.weight.capturable”j*WNubj1W)”}”(j4Wjuj+WJ€`j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.9.ffn_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wjuj+WJàcj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.10.attention.wq.weight.eps”j*WNubj1W)”}”(j4Wjuj+WJ@gj6WM`ubj%W)”}”(j(WŒK†”…”R”ubj1W)”}”(j4Wjuj+WJ!j6WJœôubj%W)”}”(j(WŒmodel.output.weight”j*WKj+Wh!M€>K†”…”R”ubj1W)”}”(j4Wjuj+WJ,&‘j6WJœôubj%W)”}”(j(WŒ-optimizer.state.tok_embeddings.weight.exp_avg”j*WKj+Wh!M€>K†”…”R”ubj1W)”}”(j4Wjuj+WJÈ*…j6WJœôubj%W)”}”(j(WŒ0optimizer.state.tok_embeddings.weight.exp_avg_sq”j*WKj+Wh!M€>K†”…”R”ubj1W)”}”(j4Wjuj+WJd/y!j6WJœôubj%W)”}”(j(WŒ%optimizer.state.output.weight.exp_avg”j*WKj+Wh!M€>K†”…”R”ubj1W)”}”(j4Wjuj+WJ4m#j6WJœôubj%W)”}”(j(WŒ(optimizer.state.output.weight.exp_avg_sq”j*WKj+Wh!M€>K†”…”R”ubj1W)”}”(j4Wjuj+WJœ8a%j6WJœôubj%W)”}”(j(WŒ0optimizer.param_groups.tok_embeddings.weight.eps”j*WNubj1W)”}”(j4WŒ __3_0.distcp”j+WKj6WM`ubj%W)”}”(j(WŒ4optimizer.param_groups.tok_embeddings.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WM`j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.tok_embeddings.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WMÀj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.0.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WM j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.0.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WM€ j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.0.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WM`j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WMÀ!j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.0.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WM %j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.0.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WM€(j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.0.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WMà+j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.0.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WM@/j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.0.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WM 2j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.0.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WM @j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WM€Cj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.0.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WMàFj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.0.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WM@Jj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.0.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WM Mj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.0.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WMQj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.0.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WM`Tj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.0.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WMÀWj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.0.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WM [j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.0.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WM vj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.1.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WM€yj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.1.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WMà|j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WMÀj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WM ‘j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.1.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WM€”j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.1.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WMà—j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.1.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WM@›j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.1.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WM žj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.1.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WM¢j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WM€¯j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WMà²j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.1.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WM@¶j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.1.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WM ¹j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.1.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WM½j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.1.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WM`Àj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.1.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WMÀÃj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.1.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WM Çj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.1.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WM€Êj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.1.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WM€åj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.2.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WMàèj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.2.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WM@ìj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WM ýj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ€j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.2.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJàj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.2.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ@j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.2.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ  j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.2.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.2.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ`j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJàj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@"j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.2.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ %j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.2.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ)j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.2.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ`,j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.2.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ/j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.2.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ 3j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.2.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ€6j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.2.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJà9j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.2.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJàTj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.3.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Xj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.3.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ [j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ€lj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJàoj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.3.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ@sj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.3.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ vj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.3.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJzj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.3.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`}j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.3.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ€j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Žj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ ‘j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.3.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ•j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.3.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ`˜j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.3.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ›j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.3.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ Ÿj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.3.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ€¢j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.3.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJà¥j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.3.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ@©j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.3.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Äj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.4.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ Çj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.4.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJËj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJàÛj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@ßj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.4.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ âj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.4.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJæj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.4.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`éj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.4.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀìj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.4.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ ðj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ ýj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.4.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.4.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJÀj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.4.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.4.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ€j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.4.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJàj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.4.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ@j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.4.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.4.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ 3j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.5.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ7j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.5.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ`:j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Kj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ Nj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.5.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJRj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.5.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Uj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.5.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀXj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.5.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ \j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.5.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ€_j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJmj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`pj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.5.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀsj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.5.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ wj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.5.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ€zj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.5.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJà}j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.5.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.5.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ „j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.5.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJˆj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.5.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ£j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.6.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`¦j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.6.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ©j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ ºj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ¾j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.6.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Áj6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.6.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJÀÄj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.6.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ Èj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.6.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ€Ëj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.6.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJàÎj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Üj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀßj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.6.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ ãj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.6.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ€æj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.6.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJàéj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.6.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@íj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.6.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ ðj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.6.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJôj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.6.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ`÷j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.6.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.7.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.7.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ*j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`-j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.7.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ0j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.7.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ 4j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.7.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ€7j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.7.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJà:j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.7.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ@>j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJÀKj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ Oj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.7.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ€Rj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.7.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJàUj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.7.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Yj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.7.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ \j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.7.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.7.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ`cj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.7.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJÀfj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.7.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.8.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ …j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.8.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ€ˆj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`™j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀœj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.8.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ  j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.8.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ€£j6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.8.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJà¦j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.8.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@ªj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.8.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ ­j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ »j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ€¾j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.8.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJàÁj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.8.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Åj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.8.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ Èj6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.8.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJÌj6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.8.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Ïj6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.8.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJÀÒj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.8.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ Öj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.8.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ ñj6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.9.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ€ôj6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.9.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJà÷j6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.attention.wk.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀj6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.attention.wk.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.9.attention.wv.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ€j6WM`ubj%W)”}”(j(WŒ;optimizer.param_groups.layers.9.attention.wv.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJàj6WM`ubj%W)”}”(j(WŒBoptimizer.param_groups.layers.9.attention.wv.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@j6WM`ubj%W)”}”(j(WŒ6optimizer.param_groups.layers.9.attention.wo.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ j6WM`ubj%W)”}”(j(WŒ@optimizer.param_groups.layers.9.attention.wo.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.feed_forward.w1.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ€*j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w1.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJà-j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.9.feed_forward.w1.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@1j6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.9.feed_forward.w2.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ 4j6WM`ubj%W)”}”(j(WŒ>optimizer.param_groups.layers.9.feed_forward.w2.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ8j6WM`ubj%W)”}”(j(WŒEoptimizer.param_groups.layers.9.feed_forward.w2.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`;j6WM`ubj%W)”}”(j(WŒ9optimizer.param_groups.layers.9.feed_forward.w3.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ>j6WM`ubj%W)”}”(j(WŒCoptimizer.param_groups.layers.9.feed_forward.w3.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ Bj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.9.feed_forward.w3.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ€Ej6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.9.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ€`j6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.10.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJàcj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.10.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ@gj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.10.attention.wq.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ jj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.10.attention.wq.weight.fused”j*WNubj1W)”}”(j4Wj€ˆj+WJnj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.10.attention.wk.weight.betas”j*WNubj1W)”}”(j4Wj€ˆj+WJ`qj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.10.attention_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ¿j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.10.attention_norm.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Âj6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.10.attention_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJÀÅj6WM`ubj%W)”}”(j(WŒ4optimizer.param_groups.layers.10.ffn_norm.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ Éj6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.10.ffn_norm.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJ€Ìj6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.10.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJàÏj6WM`ubj%W)”}”(j(WŒ7optimizer.param_groups.layers.11.attention.wq.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Ój6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.11.attention.wq.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJ Öj6WM`ubj%W)”}”(j(WŒ=optimizer.param_groups.layers.11.attention.wq.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJÚj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.11.attention.wq.weight.fused”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Ýj6WM`ubj%W)”}”(j(WŒ:optimizer.param_groups.layers.11.attention.wk.weight.betas”j*WNubj1W)”}”(j4Wj€ˆj+WJÀàj6WM`ubj%W)”}”(j(WŒoptimizer.param_groups.layers.11.attention_norm.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ`.j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.11.attention_norm.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJÀ1j6WM`ubj%W)”}”(j(WŒAoptimizer.param_groups.layers.11.attention_norm.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ 5j6WM`ubj%W)”}”(j(WŒ4optimizer.param_groups.layers.11.ffn_norm.weight.eps”j*WNubj1W)”}”(j4Wj€ˆj+WJ€8j6WM`ubj%W)”}”(j(WŒ8optimizer.param_groups.layers.11.ffn_norm.weight.foreach”j*WNubj1W)”}”(j4Wj€ˆj+WJà;j6WM`ubj%W)”}”(j(WŒ?optimizer.param_groups.layers.11.ffn_norm.weight.differentiable”j*WNubj1W)”}”(j4Wj€ˆj+WJ@?j6WM`ubj%W)”}”(j(WŒ%optimizer.param_groups.norm.weight.lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ Bj6WM`ubj%W)”}”(j(WŒ/optimizer.param_groups.norm.weight.weight_decay”j*WNubj1W)”}”(j4Wj€ˆj+WJFj6WM`ubj%W)”}”(j(WŒ+optimizer.param_groups.norm.weight.maximize”j*WNubj1W)”}”(j4Wj€ˆj+WJ`Ij6WM`ubj%W)”}”(j(WŒ(optimizer.param_groups.norm.weight.fused”j*WNubj1W)”}”(j4Wj€ˆj+WJÀLj6WM`ubj%W)”}”(j(WŒ*optimizer.param_groups.output.weight.betas”j*WNubj1W)”}”(j4Wj€ˆj+WJ Pj6WM`ubj%W)”}”(j(WŒ,optimizer.param_groups.output.weight.amsgrad”j*WNubj1W)”}”(j4Wj€ˆj+WJ€Sj6WM`ubj%W)”}”(j(WŒ/optimizer.param_groups.output.weight.capturable”j*WNubj1W)”}”(j4Wj€ˆj+WJàVj6WM`ubj%W)”}”(j(WŒ/optimizer.param_groups.output.weight.initial_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJ@Zj6WM`ubj%W)”}”(j(WŒscheduler.verbose”j*WNubj1W)”}”(j4Wj€ˆj+WJ ]j6WM`ubj%W)”}”(j(WŒscheduler._last_lr”j*WNubj1W)”}”(j4Wj€ˆj+WJaj6WM`ubj%W)”}”(j(WŒtraining_progress.step”j*WNubj1W)”}”(j4Wj€ˆj+WJ`dj6WM`ubj%W)”}”(j(WŒ1optimizer.state.layers.0.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJÀgj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.0.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ\lj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.0.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJøpj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.1.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ”uj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.1.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ0zj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.1.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJÌ~j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.2.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJhƒj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.2.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJˆj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.2.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ Œj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.3.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ<‘j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.3.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJØ•j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.3.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJtšj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.4.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJŸj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.4.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ¬£j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.4.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJH¨j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.5.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJä¬j6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.5.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ€±j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.5.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ¶j6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.6.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ¸ºj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.6.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJT¿j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.6.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJðÃj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.7.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJŒÈj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.7.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ(Íj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.7.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJÄÑj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.8.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ`Öj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.8.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJüÚj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.8.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ˜ßj6WMœubj%W)”}”(j(WŒ1optimizer.state.layers.9.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ4äj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.9.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJÐèj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.9.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJlíj6WMœubj%W)”}”(j(WŒ2optimizer.state.layers.10.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJòj6WMœubj%W)”}”(j(WŒ5optimizer.state.layers.10.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ¤öj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.10.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ@ûj6WMœubj%W)”}”(j(WŒ2optimizer.state.layers.11.attention.wk.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJÜÿj6WMœubj%W)”}”(j(WŒ5optimizer.state.layers.11.feed_forward.w1.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJxj6WMœubj%W)”}”(j(WŒ4optimizer.state.layers.11.attention_norm.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ j6WMœubj%W)”}”(j(WŒ"optimizer.state.output.weight.step”j*WKj+Wh!)…”R”ubj1W)”}”(j4Wj€ˆj+WJ° j6WMœubj%W)”}”(j(WŒ$model.layers.0.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJLj6WMœubj%W)”}”(j(WŒmodel.layers.0.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJèj6WMœubj%W)”}”(j(WŒ$model.layers.1.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ„#j6WMœubj%W)”}”(j(WŒmodel.layers.1.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ ,j6WMœubj%W)”}”(j(WŒ$model.layers.2.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ¼4j6WMœubj%W)”}”(j(WŒmodel.layers.2.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJX=j6WMœubj%W)”}”(j(WŒ$model.layers.3.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJôEj6WMœubj%W)”}”(j(WŒmodel.layers.3.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJNj6WMœubj%W)”}”(j(WŒ$model.layers.4.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ,Wj6WMœubj%W)”}”(j(WŒmodel.layers.4.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJÈ_j6WMœubj%W)”}”(j(WŒ$model.layers.5.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJdhj6WMœubj%W)”}”(j(WŒmodel.layers.5.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJqj6WMœubj%W)”}”(j(WŒ$model.layers.6.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJœyj6WMœubj%W)”}”(j(WŒmodel.layers.6.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ8‚j6WMœubj%W)”}”(j(WŒ$model.layers.7.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJÔŠj6WMœubj%W)”}”(j(WŒmodel.layers.7.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJp“j6WMœubj%W)”}”(j(WŒ$model.layers.8.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ œj6WMœubj%W)”}”(j(WŒmodel.layers.8.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ¨¤j6WMœubj%W)”}”(j(WŒ$model.layers.9.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJD­j6WMœubj%W)”}”(j(WŒmodel.layers.9.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJàµj6WMœubj%W)”}”(j(WŒ%model.layers.10.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ|¾j6WMœubj%W)”}”(j(WŒmodel.layers.10.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJÇj6WMœubj%W)”}”(j(WŒ%model.layers.11.attention_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ´Ïj6WMœubj%W)”}”(j(WŒmodel.layers.11.ffn_norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJPØj6WMœubj%W)”}”(j(WŒmodel.norm.weight”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJìàj6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.0.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJˆéj6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.0.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ$òj6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.0.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJÀúj6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.0.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ\j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.1.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJø j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.1.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ”j6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.1.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ0j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.1.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJÌ%j6WMœubj%W)”}”(j(WŒ6optimizer.state.layers.2.attention_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJh.j6WMœubj%W)”}”(j(WŒ9optimizer.state.layers.2.attention_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ7j6WMœubj%W)”}”(j(WŒ0optimizer.state.layers.2.ffn_norm.weight.exp_avg”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ ?j6WMœubj%W)”}”(j(WŒ3optimizer.state.layers.2.ffn_norm.weight.exp_avg_sq”j*WKj+Wh!M…”…”R”ubj1W)”}”(j4Wj€ˆj+WJ