From d0d343bc119cd4a4953bc8ad25dbd50b29654785 Mon Sep 17 00:00:00 2001
From: Kyle Hippe
Date: Mon, 13 Mar 2023 16:36:07 -0500
Subject: [PATCH 1/2] Updating tokenizer to reflect actual vocab count, adding
 updated tokenizer with BOS EOS tokens

---
 ...ocab.json => codon_wordlevel_69vocab.json} |   0
 .../codon_wordlevel_71vocab.json              | 209 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 rename genslm/tokenizer_files/{codon_wordlevel_100vocab.json => codon_wordlevel_69vocab.json} (100%)
 create mode 100644 genslm/tokenizer_files/codon_wordlevel_71vocab.json

diff --git a/genslm/tokenizer_files/codon_wordlevel_100vocab.json b/genslm/tokenizer_files/codon_wordlevel_69vocab.json
similarity index 100%
rename from genslm/tokenizer_files/codon_wordlevel_100vocab.json
rename to genslm/tokenizer_files/codon_wordlevel_69vocab.json
diff --git a/genslm/tokenizer_files/codon_wordlevel_71vocab.json b/genslm/tokenizer_files/codon_wordlevel_71vocab.json
new file mode 100644
index 00000000..388b7120
--- /dev/null
+++ b/genslm/tokenizer_files/codon_wordlevel_71vocab.json
@@ -0,0 +1,209 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[BOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[EOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 5,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 6,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[BOS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[EOS]",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "[BOS]": {
+        "id": "[BOS]",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "[BOS]"
+        ]
+      },
+      "[EOS]": {
+        "id": "[EOS]",
+        "ids": [
+          3
+        ],
+        "tokens": [
+          "[EOS]"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "[UNK]": 0,
+      "[CLS]": 1,
+      "[BOS]": 2,
+      "[EOS]": 3,
+      "[SEP]": 4,
+      "[PAD]": 5,
+      "[MASK]": 6,
+      "GGC": 7,
+      "GCC": 8,
+      "ATC": 9,
+      "GAC": 10,
+      "GAA": 11,
+      "ATG": 12,
+      "GTG": 13,
+      "CTG": 14,
+      "GTC": 15,
+      "GCG": 16,
+      "GAT": 17,
+      "AAA": 18,
+      "GGT": 19,
+      "AAG": 20,
+      "GAG": 21,
+      "ACC": 22,
+      "AAC": 23,
+      "GTT": 24,
+      "ATT": 25,
+      "GCA": 26,
+      "CTC": 27,
+      "CGC": 28,
+      "GCT": 29,
+      "CAG": 30,
+      "CCG": 31,
+      "TTC": 32,
+      "GTA": 33,
+      "TCG": 34,
+      "GGA": 35,
+      "AAT": 36,
+      "TAC": 37,
+      "CTT": 38,
+      "TTG": 39,
+      "ACG": 40,
+      "TCC": 41,
+      "GGG": 42,
+      "AGC": 43,
+      "CCC": 44,
+      "ACA": 45,
+      "ACT": 46,
+      "TCT": 47,
+      "TTA": 48,
+ "CGT": 49, + "TAT": 50, + "CAA": 51, + "CGG": 52, + "TTT": 53, + "CAC": 54, + "CCT": 55, + "CCA": 56, + "TGG": 57, + "ATA": 58, + "TCA": 59, + "TGC": 60, + "AGT": 61, + "AGA": 62, + "CAT": 63, + "TGT": 64, + "CTA": 65, + "AGG": 66, + "TAA": 67, + "CGA": 68, + "TGA": 69, + "TAG": 70 + }, + "unk_token": "[UNK]" + } +} \ No newline at end of file From d773d47919930a01f4e615ad317b8376506269e1 Mon Sep 17 00:00:00 2001 From: Kyle Hippe Date: Mon, 13 Mar 2023 17:17:06 -0500 Subject: [PATCH 2/2] Did not add all files to previous commit... oops --- docs/COMMANDS.md | 6 +++--- .../training/covid_models/250M_finetune_first_year.yaml | 2 +- .../training/covid_models/25M_finetune_first_year.yaml | 2 +- examples/training/foundation_models/250M_foundation.yaml | 2 +- examples/training/foundation_models/25B_foundation.yaml | 2 +- examples/training/foundation_models/25M_foundation.yaml | 2 +- examples/training/foundation_models/2B_foundation.yaml | 2 +- genslm/cmdline/process_single_family_file.py | 2 +- genslm/config.py | 2 +- genslm/inference.py | 8 ++++---- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index fed1bb9c..806fc506 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -29,7 +29,7 @@ python -m genslm.cmdline.remove_neox_attention_bias \ 2. Setup a config file that looks like this: ``` load_pt_checkpoint: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/model-epoch69-val_loss0.01.pt -tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json +tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json data_file: $DATA.h5 embeddings_out_path: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/train_embeddings/ model_config_json: /lus/eagle/projects/CVD-Mol-AI/hippekp/model_training/genome_finetuning_25m/config/neox_25,290,752.json @@ -64,7 +64,7 @@ Converting a directory of fasta files into a directory of h5 files (Step one of python -m genslm.cmdline.fasta_to_h5 \ --fasta $PATH_TO_FASTA_DIR \ --h5_dir $PATH_TO_OUTDIR \ - --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json + --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json ``` Converting a directory of h5 files into a single h5 file (Step two of data preprocessing for pretraining, output of this step is what we use for pretraining) @@ -83,7 +83,7 @@ Converting individual fasta files into individual h5 files (Useful for getting e python -m genslm.cmdline.single_fasta_to_h5 \ -f $PATH_TO_SINGLE_FASTA \ --h5 $PATH_TO_SINGLE_H5 \ - -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json \ + -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json \ -b 10240 \ -n 16\ --train_val_test_split diff --git a/examples/training/covid_models/250M_finetune_first_year.yaml b/examples/training/covid_models/250M_finetune_first_year.yaml index d3a616c4..01ed8bb3 100644 --- a/examples/training/covid_models/250M_finetune_first_year.yaml +++ b/examples/training/covid_models/250M_finetune_first_year.yaml @@ -16,7 +16,7 @@ limit_val_batches: 32 check_val_every_n_epoch: 1 checkpoint_every_n_train_steps: 500 checkpoint_every_n_epochs: null -tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json +tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json train_file: /path/to/data/first_year/first_year_train.h5 val_file: /path/to/data/first_year/first_year_val.h5 
 test_file: /path/to/data/first_year/first_year_val.h5
diff --git a/examples/training/covid_models/25M_finetune_first_year.yaml b/examples/training/covid_models/25M_finetune_first_year.yaml
index 4e52424b..81ae29fd 100644
--- a/examples/training/covid_models/25M_finetune_first_year.yaml
+++ b/examples/training/covid_models/25M_finetune_first_year.yaml
@@ -16,7 +16,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/first_year/first_year_train.h5
 val_file: /path/to/data/first_year/first_year_val.h5
 test_file: /path/to/data/first_year/first_year_val.h5
diff --git a/examples/training/foundation_models/250M_foundation.yaml b/examples/training/foundation_models/250M_foundation.yaml
index 905943c1..75c7a573 100644
--- a/examples/training/foundation_models/250M_foundation.yaml
+++ b/examples/training/foundation_models/250M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/25B_foundation.yaml b/examples/training/foundation_models/25B_foundation.yaml
index ef11d56d..42edac10 100644
--- a/examples/training/foundation_models/25B_foundation.yaml
+++ b/examples/training/foundation_models/25B_foundation.yaml
@@ -16,7 +16,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 50
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/25M_foundation.yaml b/examples/training/foundation_models/25M_foundation.yaml
index f6ff257b..336aae39 100644
--- a/examples/training/foundation_models/25M_foundation.yaml
+++ b/examples/training/foundation_models/25M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/2B_foundation.yaml b/examples/training/foundation_models/2B_foundation.yaml
index 499f4acd..bc69f42a 100644
--- a/examples/training/foundation_models/2B_foundation.yaml
+++ b/examples/training/foundation_models/2B_foundation.yaml
@@ -2,7 +2,7 @@ wandb_active: true
 wandb_project_name: codon_transformer
 wandb_entity_name: gene_mdh_gan
 checkpoint_dir: patric_2.5B_pretraining/checkpoints_v2/
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/genslm/cmdline/process_single_family_file.py b/genslm/cmdline/process_single_family_file.py
index 45b365cb..bd34396c 100644
--- a/genslm/cmdline/process_single_family_file.py
+++ b/genslm/cmdline/process_single_family_file.py
@@ -31,7 +31,7 @@ def main(input_fasta: Path, output_h5: Path, tokenizer_path: Path, block_size: i
         "--tokenizer_file",
         help="Path to tokenizer file",
         default=(
-            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_100vocab.json"
+            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_69vocab.json"
         ),
     )
     parser.add_argument(
diff --git a/genslm/config.py b/genslm/config.py
index 9450cf3d..93a3620d 100644
--- a/genslm/config.py
+++ b/genslm/config.py
@@ -131,7 +131,7 @@ class ModelSettings(BaseSettings):
     tokenizer_file: Path = (
         Path(genslm.__file__).parent
         / "tokenizer_files"
-        / "codon_wordlevel_100vocab.json"
+        / "codon_wordlevel_69vocab.json"
     )
     """Path to the tokenizer file."""
     train_file: Path
diff --git a/genslm/inference.py b/genslm/inference.py
index 51a8b6e4..7d0cc338 100644
--- a/genslm/inference.py
+++ b/genslm/inference.py
@@ -22,25 +22,25 @@ class GenSLM(nn.Module):
     MODELS: Dict[str, Dict[str, str]] = {
         "genslm_25M_patric": {
             "config": str(__architecture_path / "neox" / "neox_25,290,752.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_25m_epoch01-val_loss_0.57_bias_removed.pt",
             "seq_length": "2048",
         },
         "genslm_250M_patric": {
             "config": str(__architecture_path / "neox" / "neox_244,464,576.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_250m_epoch00_val_loss_0.48_attention_removed.pt",
             "seq_length": "2048",
         },
         "genslm_2.5B_patric": {
             "config": str(__architecture_path / "neox" / "neox_2,533,931,008.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_2.5b_epoch00_val_los_0.29_bias_removed.pt",
             "seq_length": "2048",
         },
         "genslm_25B_patric": {
             "config": str(__architecture_path / "neox" / "neox_25,076,188,032.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "model-epoch00-val_loss0.70-v2.pt",
             "seq_length": "2048",
         },