-
Notifications
You must be signed in to change notification settings - Fork 23
/
config.yaml
24 lines (23 loc) · 1 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Reference genome sequence, taken from NCBI (assembly GCF_000001735.4, TAIR10.1).
# NCBI genomes have ~20% of sequence soft-masked, compared to ~30% in Ensembl
# Plants, which I think may be excessive.
FASTA_URL: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_genomic.fna.gz"
# Gene annotation, taken from Ensembl Plants release 55.
# NOTE(review): despite the key name, this URL points to a GFF3 file, not a GTF;
# presumably the consumer parses GFF3 - confirm before changing either.
# NOTE(review): the sequence comes from NCBI and the annotation from Ensembl
# Plants; sequence names may differ between the two - verify they are reconciled.
GTF_URL: "https://ftp.ensemblgenomes.org/pub/plants/release-55/gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.55.gff3.gz"
# MLM dataset creation from NCBI assemblies.
# Assembly lists: presumably the raw table and its filtered counterpart
# ("_filt") - confirm which one each pipeline step reads.
raw_assemblies_path: "input/assembly_list/brassicales_annotated.tsv"
assemblies_path: "input/assembly_list/brassicales_annotated_filt.tsv"
# Split proportions; as configured here everything goes to train (1.0 / 0.0 / 0.0).
# NOTE(review): validation/test chromosomes are whitelisted below even though
# their proportions are 0.0 - confirm how the two settings interact.
proportion_train: 1.0
proportion_validation: 0.0
proportion_test: 0.0
# step_size is half of window_size, so consecutive windows presumably overlap
# by 50% - confirm against the windowing code.
window_size: 512
step_size: 256
# Booleans are written in canonical lowercase so they resolve identically
# under YAML 1.1 and YAML 1.2 loaders.
# add_rc: presumably "add reverse complement" - confirm.
add_rc: true
promoter_upstream: 1000
# Same accession as the assembly referenced by FASTA_URL.
target_assembly: "GCF_000001735.4"
subsample_to_target: true
whitelist_validation_chroms:
- "NC_003075.7"  # Arabidopsis thaliana chr4
whitelist_test_chroms:
- "NC_003076.8"  # Arabidopsis thaliana chr5
# It's good to have at least num_cpus shards to increase the parallel loading
# speed of iterable datasets from the HF hub.
# Written without digit separators: `500_000` is only an integer under YAML 1.1
# loaders; loaders that follow the YAML 1.2 core schema strictly read it as a
# string.
samples_per_file: 500000