Updating existing tokenizer naming, adding new tokenizer with BOS EOS tokens #91

Merged: 2 commits, Mar 27, 2023
6 changes: 3 additions & 3 deletions docs/COMMANDS.md
@@ -29,7 +29,7 @@ python -m genslm.cmdline.remove_neox_attention_bias \
2. Setup a config file that looks like this:
```
load_pt_checkpoint: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/model-epoch69-val_loss0.01.pt
-tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json
data_file: $DATA.h5
embeddings_out_path: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/train_embeddings/
model_config_json: /lus/eagle/projects/CVD-Mol-AI/hippekp/model_training/genome_finetuning_25m/config/neox_25,290,752.json
@@ -64,7 +64,7 @@ Converting a directory of fasta files into a directory of h5 files (Step one of
python -m genslm.cmdline.fasta_to_h5 \
--fasta $PATH_TO_FASTA_DIR \
--h5_dir $PATH_TO_OUTDIR \
-  --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json
+  --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json
```

Converting a directory of h5 files into a single h5 file (Step two of data preprocessing for pretraining, output of this step is what we use for pretraining)
@@ -83,7 +83,7 @@ Converting individual fasta files into individual h5 files (Useful for getting e
python -m genslm.cmdline.single_fasta_to_h5 \
-f $PATH_TO_SINGLE_FASTA \
--h5 $PATH_TO_SINGLE_H5 \
-  -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json \
+  -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json \
-b 10240 \
-n 16\
--train_val_test_split
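For anyone driving the updated preprocessing step from Python rather than a shell script, here is a minimal sketch that shells out to `single_fasta_to_h5` with the renamed tokenizer. The FASTA/H5 paths are placeholders; the flags and values are taken from the hunk above.

```python
import subprocess
from pathlib import Path

# Placeholder paths; substitute your own FASTA input and H5 output.
tokenizer = Path.home() / "github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json"

subprocess.run(
    [
        "python", "-m", "genslm.cmdline.single_fasta_to_h5",
        "-f", "sample.fasta",          # single input FASTA
        "--h5", "sample.h5",           # single output H5
        "-t", str(tokenizer),          # renamed tokenizer file
        "-b", "10240",                 # block size, as in the example above
        "-n", "16",                    # worker count, as in the example above
        "--train_val_test_split",
    ],
    check=True,
)
```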
@@ -16,7 +16,7 @@ limit_val_batches: 32
check_val_every_n_epoch: 1
checkpoint_every_n_train_steps: 500
checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/first_year/first_year_train.h5
val_file: /path/to/data/first_year/first_year_val.h5
test_file: /path/to/data/first_year/first_year_val.h5
@@ -16,7 +16,7 @@ limit_val_batches: 32
check_val_every_n_epoch: 1
checkpoint_every_n_train_steps: 500
checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/first_year/first_year_train.h5
val_file: /path/to/data/first_year/first_year_val.h5
test_file: /path/to/data/first_year/first_year_val.h5
2 changes: 1 addition & 1 deletion examples/training/foundation_models/250M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
check_val_every_n_epoch: 1
checkpoint_every_n_train_steps: 500
checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
2 changes: 1 addition & 1 deletion examples/training/foundation_models/25B_foundation.yaml
@@ -16,7 +16,7 @@ limit_val_batches: 32
check_val_every_n_epoch: 1
checkpoint_every_n_train_steps: 50
checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
2 changes: 1 addition & 1 deletion examples/training/foundation_models/25M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
check_val_every_n_epoch: 1
checkpoint_every_n_train_steps: 500
checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
2 changes: 1 addition & 1 deletion examples/training/foundation_models/2B_foundation.yaml
@@ -2,7 +2,7 @@ wandb_active: true
wandb_project_name: codon_transformer
wandb_entity_name: gene_mdh_gan
checkpoint_dir: patric_2.5B_pretraining/checkpoints_v2/
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
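Every example training config now references the renamed tokenizer file. A throwaway check one could run from a repo checkout (this assumes PyYAML is installed, that `tokenizer_file` is a plain string in each config, and that this PR updated all configs under `examples/training`):

```python
from pathlib import Path

import yaml  # PyYAML

# Verify every example training config references the renamed tokenizer file.
for cfg_path in Path("examples/training").rglob("*.yaml"):
    cfg = yaml.safe_load(cfg_path.read_text())
    tokenizer = cfg.get("tokenizer_file", "")
    assert tokenizer.endswith("codon_wordlevel_69vocab.json"), (cfg_path, tokenizer)
print("all example configs updated")
```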
2 changes: 1 addition & 1 deletion genslm/cmdline/process_single_family_file.py
@@ -31,7 +31,7 @@ def main(input_fasta: Path, output_h5: Path, tokenizer_path: Path, block_size: i
"--tokenizer_file",
help="Path to tokenizer file",
default=(
-            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_100vocab.json"
+            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_69vocab.json"
),
)
parser.add_argument(
2 changes: 1 addition & 1 deletion genslm/config.py
@@ -131,7 +131,7 @@ class ModelSettings(BaseSettings):
tokenizer_file: Path = (
Path(genslm.__file__).parent
/ "tokenizer_files"
/ "codon_wordlevel_100vocab.json"
/ "codon_wordlevel_69vocab.json"
)
"""Path to the tokenizer file."""
train_file: Path
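Because `ModelSettings` builds its default from the installed package location, the renamed file is what downstream code picks up whenever no `tokenizer_file` is supplied. A small sketch that mirrors the default shown in the hunk above (purely illustrative, not part of this diff):

```python
from pathlib import Path

import genslm

# Mirrors the ModelSettings default tokenizer path after this change.
default_tokenizer = (
    Path(genslm.__file__).parent
    / "tokenizer_files"
    / "codon_wordlevel_69vocab.json"
)
print(default_tokenizer, default_tokenizer.exists())
```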
8 changes: 4 additions & 4 deletions genslm/inference.py
@@ -22,25 +22,25 @@ class GenSLM(nn.Module):
MODELS: Dict[str, Dict[str, str]] = {
"genslm_25M_patric": {
"config": str(__architecture_path / "neox" / "neox_25,290,752.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
"weights": "patric_25m_epoch01-val_loss_0.57_bias_removed.pt",
"seq_length": "2048",
},
"genslm_250M_patric": {
"config": str(__architecture_path / "neox" / "neox_244,464,576.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
"weights": "patric_250m_epoch00_val_loss_0.48_attention_removed.pt",
"seq_length": "2048",
},
"genslm_2.5B_patric": {
"config": str(__architecture_path / "neox" / "neox_2,533,931,008.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
"weights": "patric_2.5b_epoch00_val_los_0.29_bias_removed.pt",
"seq_length": "2048",
},
"genslm_25B_patric": {
"config": str(__architecture_path / "neox" / "neox_25,076,188,032.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
"tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
"weights": "model-epoch00-val_loss0.70-v2.pt",
"seq_length": "2048",
},
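Since the `MODELS` registry resolves the tokenizer internally, the pretrained model IDs keep working unchanged after the rename. A minimal inference sketch, assuming the public API described in the project README (`GenSLM(model_id, model_cache_dir=...)`, `SequenceDataset`, and the `seq_length`/`tokenizer` attributes); anything not shown in the registry above is an assumption, not something this diff introduces:

```python
import torch
from torch.utils.data import DataLoader

from genslm import GenSLM, SequenceDataset  # assumed public exports

# Assumed constructor: a key from the MODELS registry plus a directory holding
# the downloaded weights file named in that registry.
model = GenSLM("genslm_25M_patric", model_cache_dir="/tmp/genslm_models")
model.eval()

# Nucleotide sequences are grouped into codons by SequenceDataset before tokenization.
sequences = ["ATGAAAGGTTAG"]
dataset = SequenceDataset(sequences, model.seq_length, model.tokenizer)

with torch.no_grad():
    for batch in DataLoader(dataset, batch_size=1):
        outputs = model(
            batch["input_ids"], batch["attention_mask"], output_hidden_states=True
        )
        print(outputs.hidden_states[-1].shape)  # (batch, seq_len, hidden_dim)
```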
209 changes: 209 additions & 0 deletions genslm/tokenizer_files/codon_wordlevel_71vocab.json
(Generated file; not rendered in the diff view.)
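The headline addition is the new `codon_wordlevel_71vocab.json`, which extends the 69-token codon vocabulary with BOS/EOS tokens. A quick way to inspect it with the Hugging Face `tokenizers` library; the exact special-token strings and whether a post-processor adds them automatically are assumptions, so check the JSON for the names actually used:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("genslm/tokenizer_files/codon_wordlevel_71vocab.json")
print(tok.get_vocab_size())  # expected: 71 (69 codon tokens + BOS/EOS)

# Codons are whitespace-separated words; if the file configures a post-processor,
# the BOS/EOS tokens should appear around the encoded sequence.
enc = tok.encode("ATG AAA GGT TAG")
print(enc.tokens)
```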