From d0d343bc119cd4a4953bc8ad25dbd50b29654785 Mon Sep 17 00:00:00 2001
From: Kyle Hippe
Date: Mon, 13 Mar 2023 16:36:07 -0500
Subject: [PATCH 1/2] Updating tokenizer to reflect actual vocab count, adding
 updated tokenizer with BOS EOS tokens

---
 ...ocab.json => codon_wordlevel_69vocab.json} |   0
 .../codon_wordlevel_71vocab.json              | 209 ++++++++++++++++++
 2 files changed, 209 insertions(+)
 rename genslm/tokenizer_files/{codon_wordlevel_100vocab.json => codon_wordlevel_69vocab.json} (100%)
 create mode 100644 genslm/tokenizer_files/codon_wordlevel_71vocab.json

diff --git a/genslm/tokenizer_files/codon_wordlevel_100vocab.json b/genslm/tokenizer_files/codon_wordlevel_69vocab.json
similarity index 100%
rename from genslm/tokenizer_files/codon_wordlevel_100vocab.json
rename to genslm/tokenizer_files/codon_wordlevel_69vocab.json
diff --git a/genslm/tokenizer_files/codon_wordlevel_71vocab.json b/genslm/tokenizer_files/codon_wordlevel_71vocab.json
new file mode 100644
index 00000000..388b7120
--- /dev/null
+++ b/genslm/tokenizer_files/codon_wordlevel_71vocab.json
@@ -0,0 +1,209 @@
+{
+  "version": "1.0",
+  "truncation": null,
+  "padding": null,
+  "added_tokens": [
+    {
+      "id": 0,
+      "content": "[UNK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 1,
+      "content": "[CLS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 2,
+      "content": "[BOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 3,
+      "content": "[EOS]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 4,
+      "content": "[SEP]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 5,
+      "content": "[PAD]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 6,
+      "content": "[MASK]",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    }
+  ],
+  "normalizer": null,
+  "pre_tokenizer": {
+    "type": "Whitespace"
+  },
+  "post_processor": {
+    "type": "TemplateProcessing",
+    "single": [
+      {
+        "SpecialToken": {
+          "id": "[BOS]",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "SpecialToken": {
+          "id": "[EOS]",
+          "type_id": 0
+        }
+      }
+    ],
+    "pair": [
+      {
+        "Sequence": {
+          "id": "A",
+          "type_id": 0
+        }
+      },
+      {
+        "Sequence": {
+          "id": "B",
+          "type_id": 1
+        }
+      }
+    ],
+    "special_tokens": {
+      "[BOS]": {
+        "id": "[BOS]",
+        "ids": [
+          2
+        ],
+        "tokens": [
+          "[BOS]"
+        ]
+      },
+      "[EOS]": {
+        "id": "[EOS]",
+        "ids": [
+          3
+        ],
+        "tokens": [
+          "[EOS]"
+        ]
+      }
+    }
+  },
+  "decoder": null,
+  "model": {
+    "type": "WordLevel",
+    "vocab": {
+      "[UNK]": 0,
+      "[CLS]": 1,
+      "[BOS]": 2,
+      "[EOS]": 3,
+      "[SEP]": 4,
+      "[PAD]": 5,
+      "[MASK]": 6,
+      "GGC": 7,
+      "GCC": 8,
+      "ATC": 9,
+      "GAC": 10,
+      "GAA": 11,
+      "ATG": 12,
+      "GTG": 13,
+      "CTG": 14,
+      "GTC": 15,
+      "GCG": 16,
+      "GAT": 17,
+      "AAA": 18,
+      "GGT": 19,
+      "AAG": 20,
+      "GAG": 21,
+      "ACC": 22,
+      "AAC": 23,
+      "GTT": 24,
+      "ATT": 25,
+      "GCA": 26,
+      "CTC": 27,
+      "CGC": 28,
+      "GCT": 29,
+      "CAG": 30,
+      "CCG": 31,
+      "TTC": 32,
+      "GTA": 33,
+      "TCG": 34,
+      "GGA": 35,
+      "AAT": 36,
+      "TAC": 37,
+      "CTT": 38,
+      "TTG": 39,
+      "ACG": 40,
+      "TCC": 41,
+      "GGG": 42,
+      "AGC": 43,
+      "CCC": 44,
+      "ACA": 45,
+      "ACT": 46,
+      "TCT": 47,
+      "TTA": 48,
+ "CGT": 49, + "TAT": 50, + "CAA": 51, + "CGG": 52, + "TTT": 53, + "CAC": 54, + "CCT": 55, + "CCA": 56, + "TGG": 57, + "ATA": 58, + "TCA": 59, + "TGC": 60, + "AGT": 61, + "AGA": 62, + "CAT": 63, + "TGT": 64, + "CTA": 65, + "AGG": 66, + "TAA": 67, + "CGA": 68, + "TGA": 69, + "TAG": 70 + }, + "unk_token": "[UNK]" + } +} \ No newline at end of file From d773d47919930a01f4e615ad317b8376506269e1 Mon Sep 17 00:00:00 2001 From: Kyle Hippe Date: Mon, 13 Mar 2023 17:17:06 -0500 Subject: [PATCH 2/2] Did not add all files to previous commit... oops --- docs/COMMANDS.md | 6 +++--- .../training/covid_models/250M_finetune_first_year.yaml | 2 +- .../training/covid_models/25M_finetune_first_year.yaml | 2 +- examples/training/foundation_models/250M_foundation.yaml | 2 +- examples/training/foundation_models/25B_foundation.yaml | 2 +- examples/training/foundation_models/25M_foundation.yaml | 2 +- examples/training/foundation_models/2B_foundation.yaml | 2 +- genslm/cmdline/process_single_family_file.py | 2 +- genslm/config.py | 2 +- genslm/inference.py | 8 ++++---- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index fed1bb9c..806fc506 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -29,7 +29,7 @@ python -m genslm.cmdline.remove_neox_attention_bias \ 2. Setup a config file that looks like this: ``` load_pt_checkpoint: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/model-epoch69-val_loss0.01.pt -tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json +tokenizer_file: /home/hippekp/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json data_file: $DATA.h5 embeddings_out_path: /home/hippekp/CVD-Mol-AI/hippekp/model_training/25m_genome_embeddings/train_embeddings/ model_config_json: /lus/eagle/projects/CVD-Mol-AI/hippekp/model_training/genome_finetuning_25m/config/neox_25,290,752.json @@ -64,7 +64,7 @@ Converting a directory of fasta files into a directory of h5 files (Step one of python -m genslm.cmdline.fasta_to_h5 \ --fasta $PATH_TO_FASTA_DIR \ --h5_dir $PATH_TO_OUTDIR \ - --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json + --tokenizer_file ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json ``` Converting a directory of h5 files into a single h5 file (Step two of data preprocessing for pretraining, output of this step is what we use for pretraining) @@ -83,7 +83,7 @@ Converting individual fasta files into individual h5 files (Useful for getting e python -m genslm.cmdline.single_fasta_to_h5 \ -f $PATH_TO_SINGLE_FASTA \ --h5 $PATH_TO_SINGLE_H5 \ - -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_100vocab.json \ + -t ~/github/genslm/genslm/tokenizer_files/codon_wordlevel_69vocab.json \ -b 10240 \ -n 16\ --train_val_test_split diff --git a/examples/training/covid_models/250M_finetune_first_year.yaml b/examples/training/covid_models/250M_finetune_first_year.yaml index d3a616c4..01ed8bb3 100644 --- a/examples/training/covid_models/250M_finetune_first_year.yaml +++ b/examples/training/covid_models/250M_finetune_first_year.yaml @@ -16,7 +16,7 @@ limit_val_batches: 32 check_val_every_n_epoch: 1 checkpoint_every_n_train_steps: 500 checkpoint_every_n_epochs: null -tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json +tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json train_file: /path/to/data/first_year/first_year_train.h5 val_file: /path/to/data/first_year/first_year_val.h5 
 test_file: /path/to/data/first_year/first_year_val.h5
diff --git a/examples/training/covid_models/25M_finetune_first_year.yaml b/examples/training/covid_models/25M_finetune_first_year.yaml
index 4e52424b..81ae29fd 100644
--- a/examples/training/covid_models/25M_finetune_first_year.yaml
+++ b/examples/training/covid_models/25M_finetune_first_year.yaml
@@ -16,7 +16,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/first_year/first_year_train.h5
 val_file: /path/to/data/first_year/first_year_val.h5
 test_file: /path/to/data/first_year/first_year_val.h5
diff --git a/examples/training/foundation_models/250M_foundation.yaml b/examples/training/foundation_models/250M_foundation.yaml
index 905943c1..75c7a573 100644
--- a/examples/training/foundation_models/250M_foundation.yaml
+++ b/examples/training/foundation_models/250M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/25B_foundation.yaml b/examples/training/foundation_models/25B_foundation.yaml
index ef11d56d..42edac10 100644
--- a/examples/training/foundation_models/25B_foundation.yaml
+++ b/examples/training/foundation_models/25B_foundation.yaml
@@ -16,7 +16,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 50
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/25M_foundation.yaml b/examples/training/foundation_models/25M_foundation.yaml
index f6ff257b..336aae39 100644
--- a/examples/training/foundation_models/25M_foundation.yaml
+++ b/examples/training/foundation_models/25M_foundation.yaml
@@ -15,7 +15,7 @@ limit_val_batches: 32
 check_val_every_n_epoch: 1
 checkpoint_every_n_train_steps: 500
 checkpoint_every_n_epochs: null
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/examples/training/foundation_models/2B_foundation.yaml b/examples/training/foundation_models/2B_foundation.yaml
index 499f4acd..bc69f42a 100644
--- a/examples/training/foundation_models/2B_foundation.yaml
+++ b/examples/training/foundation_models/2B_foundation.yaml
@@ -2,7 +2,7 @@ wandb_active: true
 wandb_project_name: codon_transformer
 wandb_entity_name: gene_mdh_gan
 checkpoint_dir: patric_2.5B_pretraining/checkpoints_v2/
-tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_100vocab.json
+tokenizer_file: ../../genslm/tokenizer_files/codon_wordlevel_69vocab.json
 train_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_train.h5
 val_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_val.h5
 test_file: /path/to/data/patric_89M/pgfam_30k_h5_tts/combined_test.h5
diff --git a/genslm/cmdline/process_single_family_file.py b/genslm/cmdline/process_single_family_file.py
index 45b365cb..bd34396c 100644
--- a/genslm/cmdline/process_single_family_file.py
+++ b/genslm/cmdline/process_single_family_file.py
@@ -31,7 +31,7 @@ def main(input_fasta: Path, output_h5: Path, tokenizer_path: Path, block_size: i
         "--tokenizer_file",
         help="Path to tokenizer file",
         default=(
-            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_100vocab.json"
+            fp.parent.parent / "genslm/tokenizer_files/codon_wordlevel_69vocab.json"
         ),
     )
     parser.add_argument(
diff --git a/genslm/config.py b/genslm/config.py
index 9450cf3d..93a3620d 100644
--- a/genslm/config.py
+++ b/genslm/config.py
@@ -131,7 +131,7 @@ class ModelSettings(BaseSettings):
     tokenizer_file: Path = (
         Path(genslm.__file__).parent
         / "tokenizer_files"
-        / "codon_wordlevel_100vocab.json"
+        / "codon_wordlevel_69vocab.json"
     )
     """Path to the tokenizer file."""
     train_file: Path
diff --git a/genslm/inference.py b/genslm/inference.py
index 51a8b6e4..7d0cc338 100644
--- a/genslm/inference.py
+++ b/genslm/inference.py
@@ -22,25 +22,25 @@ class GenSLM(nn.Module):
     MODELS: Dict[str, Dict[str, str]] = {
         "genslm_25M_patric": {
             "config": str(__architecture_path / "neox" / "neox_25,290,752.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_25m_epoch01-val_loss_0.57_bias_removed.pt",
             "seq_length": "2048",
         },
         "genslm_250M_patric": {
             "config": str(__architecture_path / "neox" / "neox_244,464,576.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_250m_epoch00_val_loss_0.48_attention_removed.pt",
             "seq_length": "2048",
         },
         "genslm_2.5B_patric": {
             "config": str(__architecture_path / "neox" / "neox_2,533,931,008.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "patric_2.5b_epoch00_val_los_0.29_bias_removed.pt",
             "seq_length": "2048",
         },
         "genslm_25B_patric": {
             "config": str(__architecture_path / "neox" / "neox_25,076,188,032.json"),
-            "tokenizer": str(__tokenizer_path / "codon_wordlevel_100vocab.json"),
+            "tokenizer": str(__tokenizer_path / "codon_wordlevel_69vocab.json"),
             "weights": "model-epoch00-val_loss0.70-v2.pt",
             "seq_length": "2048",
         },