- MLM Monolingual -
python examples/language-modeling/run_mlm.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir
- MLM vocabulary permutation -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --permute_vocabulary --vocab_permutation_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/Multilingual/synthetic_language_files/word_based/configuration_files/permuted_vocab_seed_42_size_50265.json --word_modification add
- Random word modification -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --modify_words --word_modification add
- Inverting sentence -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --invert_word_order --word_modification add
- Inverting sentence with cache -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --invert_word_order --word_modification add
- One-to-one mapping for vocabulary -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --one_to_one_mapping --shift_special --word_modification add
- One-to-one mapping for vocabulary (with file) -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --one_to_one_mapping --shift_special --one_to_one_file ../synthetic_language_files/word_based/configuration_files/one_to_one_mapping_random_50265_fraction_70.npy --word_modification add
- Permutation language modeling -
python -m pdb examples/language-modeling/run_mlm_synthetic.py --train_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --validation_file=../../../../BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/wikitext-103-raw/wiki.valid.txt --output_dir=../../data/model_outputs/wikitext/debug --model_type=roberta --config_name=roberta-base --tokenizer_name=roberta-base --learning_rate 1e-4 --num_train_epochs 2 --warmup_steps 10000 --do_train --do_eval --save_steps 10000 --per_device_train_batch_size 2 --overwrite_output_dir --permute_words --word_modification add
- NER baseline -
python -m pdb run_ner_synthetic.py --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/train.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/dev.json --output_dir ../../../../data/model_outputs/ner/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir
- Word modification with random sampling -
python -m pdb run_ner_synthetic.py --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/train.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/dev.json --output_dir ../../../../data/model_outputs/ner/debug --save_steps -1 --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir --modify_words --word_modification replace
- Inverted order (you might have to add the --label_all_tokens flag) -
python -m pdb run_ner_synthetic.py --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/train.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/panx_dataset/en/dev.json --output_dir ../../../../data/model_outputs/ner/debug --save_steps -1 --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir --invert_word_order --word_modification replace
- POS baseline -
python -m pdb run_ner_synthetic.py --task_name pos --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/train-en.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/dev-en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir
- Inverted order -
python -m pdb run_ner_synthetic.py --task_name pos --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/train-en.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/dev-en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir --invert_word_order --word_modification replace
- Permutation language modeling -
python -m pdb run_ner_synthetic.py --task_name pos --model_name_or_path bert-base-uncased --train_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/dev-en.json --validation_file /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/multilingual_nlu/xtreme/udpos/dev-en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --overwrite_output_dir --permute_words --word_modification replace
- Inverted-order run -
python -m pdb examples/sentence_retrieval/run_sentence_retrieval_synthetic.py --model_name_or_path bert-base-multilingual-cased --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --train_file=../../data/tatoeba/en/en.json --output_dir=../../data/model_outputs/wikitext/debug --overwrite_output_dir --do_train --invert_word_order --word_modification replace --pool_type middle
- Word modification -
python -m pdb examples/sentence_retrieval/run_sentence_retrieval_synthetic.py --model_name_or_path bert-base-multilingual-cased --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --train_file=../../data/tatoeba/en/en.json --output_dir=../../data/model_outputs/wikitext/debug --overwrite_output_dir --do_train --modify_words --modify_words_probability 0.9 --word_modification replace --pool_type cls
- Bilingual evaluation -
python -m pdb examples/sentence_retrieval/run_sentence_retrieval_synthetic.py --model_name_or_path bert-base-multilingual-cased --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --train_file=../../data/tatoeba/en/en_hi.json --output_dir=../../data/model_outputs/wikitext/debug --overwrite_output_dir --do_train --modify_words --modify_words_probability 0.9 --word_modification replace --pool_type cls --bilingual
- Syntax modification: use the --bilingual flag for this as well, and set --train_file to something like /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/tatoeba/en/dep/synthetic_dep_flattened_en-en~hi@N~hi@V.json
- QA baseline -
python -m pdb run_qa_synthetic.py --task_name qa --model_name_or_path roberta-base --train_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --validation_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --doc_stride 128 --overwrite_output_dir --num_train_epochs 2
- Word modification -
python -m pdb run_qa_synthetic.py --task_name qa --model_name_or_path roberta-base --train_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --validation_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --doc_stride 128 --overwrite_output_dir --num_train_epochs 2 --modify_words --modify_words_probability 0.9 --word_modification replace
- Inverted order -
python -m pdb run_qa_synthetic.py --task_name qa --model_name_or_path roberta-base --train_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --validation_file /n/fs/nlp-asd/asd/asd/Projects/Multilingual/data/xquad/en/dev_en.json --output_dir ../../../../data/model_outputs/pos/debug --do_train --do_eval --cache_dir /n/fs/nlp-asd/asd/asd/BERT_Embeddings_Test/BERT_Embeddings_Test/global_data/transformer_models --doc_stride 128 --overwrite_output_dir --num_train_epochs 2 --invert_word_order --word_modification replace