diff --git a/.gitignore b/.gitignore index 084287c7..07984913 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,19 @@ docker/ mounted_folder/ docker_dev/ +CMakeCache.txt +CMakeFiles +CTestTestfile.cmake +ext/gatb.tar.gz +gatb-prefix +ext/gatb-core-1.4.1 +gtest +include/gatb +include/hdf5 +include/json +include/BooPHF +pandora.cbp +test/cmake_install.cmake +test/estimate_parameters_test +test/pandora_test +coverage diff --git a/README.md b/README.md index 03273504..857d243f 100644 --- a/README.md +++ b/README.md @@ -99,40 +99,55 @@ If you haven't, you will need a multiple sequence alignment for each graph. Prec ### Build index Takes a fasta-like file of PanRG sequences and constructs an index, and a directory of gfa files to be used by `pandora map` or `pandora compare`. These are output in the same directory as the PanRG file. - Usage: pandora index [options] - Options: +``` +$ pandora index --help +Usage: pandora index [options] +Options: -h,--help Show this help message -w W Window size for (w,k)-minimizers, default 14 -k K K-mer size for (w,k)-minimizers, default 15 - -t T Number of concurrent threads, default 1 + -t T Number of threads, default 1 + --offset Offset for PRG ids, default 0 + --outfile Filename for index + --log_level debug,[info],warning,error +``` The index stores (w,k)-minimizers for each PanRG path found. These parameters can be specified, but default to w=14, k=15. ### Map reads to index This takes a fasta/q of Nanopore or Illumina reads and compares to the index. It infers which of the PanRG genes/elements is present, and for those that are present it outputs the inferred sequence and a genotyped VCF. - Usage: pandora map -p PanRG_FILE -r READ_FILE -o OUTDIR - Options: - -h,--help Show this help message - -p,--prg_file PanRG_FILE Specify a fasta-style PanRG file - -r,--read_file READ_FILE Specify a file of reads in fasta/q format - -o,--outdir OUTDIR Specify directory of output - -w W Window size for (w,k)-minimizers, must be <=k, default 14 - -k K K-mer size for (w,k)-minimizers, default 15 - -m,--max_diff INT Maximum distance between consecutive hits within a cluster, default 250 bps - -e,--error_rate FLOAT Estimated error rate for reads, default 0.11/0.001 for Nanopore/Illumina - -c,--min_cluster_size INT Minimum number of hits in a cluster to consider a locus present, default 10 - --genome_size NUM_BP Estimated length of genome, used for coverage estimation, default 5000000 - --vcf_refs REF_FASTA A fasta file with an entry for each loci in the PanRG in order, giving - reference sequence to be used as VCF ref. Must have a perfect match to a - path in the graph and the same name as the locus in the graph. - --illumina Data is from Illumina, not Nanopore, so is shorter with low error rate - --bin Use binomial model for kmer coverages, default is negative binomial - --max_covg INT Maximum average coverage from reads to accept, default first 300 - --genotype Output a genotyped VCF - --discover Add denovo discovery - --denovo_kmer_size INT Kmer size to use for denovo discovery, default 11 - --log_level LEVEL Verbosity for logging, use "debug" for more output +``` +$ pandora map --help +Usage: pandora map -p PRG_FILE -r READ_FILE +Options: + -h,--help Show this help message + -p,--prg_file PRG_FILE Specify a fasta-style prg file + -r,--read_file READ_FILE Specify a file of reads in fasta format + -o,--outdir OUTDIR Specify directory of output + -w W Window size for (w,k)-minimizers, must be <=k, default 14 + -k K K-mer size for (w,k)-minimizers, default 15 + -m,--max_diff INT Maximum distance between consecutive hits within a cluster, default 500 (bps) + -e,--error_rate FLOAT Estimated error rate for reads, default 0.11 + -t T Number of threads, default 1 + --genome_size NUM_BP Estimated length of genome, used for coverage estimation + --output_kg Save kmer graphs with fwd and rev coverage annotations for found localPRGs + --output_vcf Save a vcf file for each found localPRG + --vcf_refs REF_FASTA A fasta file with an entry for each LocalPRG giving reference sequence for + VCF. Must have a perfect match in the graph and the same name as the graph + --output_comparison_paths Save a fasta file for a random selection of paths through localPRG + --output_covgs Save a file of covgs for each localPRG present, one number per base of fasta file + --output_mapped_read_fa Save a file for each gene containing read parts which overlapped it + --illumina Data is from illumina rather than nanopore, so is shorter with low error rate + --clean Add a step to clean and detangle the pangraph + --bin Use binomial model for kmer coverages, default is negative binomial + --max_covg Maximum average coverage from reads to accept + --genotype MODE Add extra step to carefully genotype sites. Has two modes: global (ML path oriented) or local (coverage oriented) + --snps_only When genotyping, include only snp sites + -d,--discover Add denovo discovery + --denovo_kmer_size Kmer size to use for denovo discovery + --log_level debug,[info],warning,error +``` ### Compare reads from several samples This takes Nanopore or Illumina read fasta/q for a number of samples, mapping each to the index. It infers which of the PanRG genes/elements is present in each sample, and outputs a presence/absence pangenome matrix, the inferred sequences for each sample and a genotyped multisample pangenome VCF. diff --git a/include/fastaq_handler.h b/include/fastaq_handler.h index 61f508c7..4d5d9d2b 100644 --- a/include/fastaq_handler.h +++ b/include/fastaq_handler.h @@ -4,36 +4,44 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include +#include "kseq.h" +KSEQ_INIT(gzFile, gzread) namespace logging = boost::log; struct FastaqHandler { - bool gzipped; - std::ifstream fastaq_file; - boost::iostreams::filtering_istreambuf inbuf; - std::istream instream; - std::string line; +private: + bool closed; + kseq_t* inbuf; + +public: + const std::string filepath; + gzFile fastaq_file; std::string name; std::string read; uint32_t num_reads_parsed; - FastaqHandler(const std::string&); + FastaqHandler(const std::string); ~FastaqHandler(); - bool eof(); + bool eof() const; void get_next(); - void skip_next(); - - void get_id(const uint32_t&); + void get_nth_read(const uint32_t& idx); void close(); + + bool is_closed() const; }; #endif diff --git a/include/kseq.h b/include/kseq.h new file mode 100644 index 00000000..2f94a64f --- /dev/null +++ b/include/kseq.h @@ -0,0 +1,242 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* Last Modified: 05MAR2012 */ + +#ifndef AC_KSEQ_H +#define AC_KSEQ_H + +#include +#include +#include + +#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r +#define KS_SEP_TAB 1 // isspace() && !' ' +#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) +#define KS_SEP_MAX 2 + +#define __KS_TYPE(type_t) \ + typedef struct __kstream_t { \ + unsigned char *buf; \ + int begin, end, is_eof; \ + type_t f; \ + } kstream_t; + +#define ks_err(ks) ((ks)->end == -1) +#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) +#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) + +#define __KS_BASIC(type_t, __bufsize) \ + static inline kstream_t *ks_init(type_t f) \ + { \ + kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ + ks->f = f; \ + ks->buf = (unsigned char*)malloc(__bufsize); \ + return ks; \ + } \ + static inline void ks_destroy(kstream_t *ks) \ + { \ + if (ks) { \ + free(ks->buf); \ + free(ks); \ + } \ + } + +#define __KS_GETC(__read, __bufsize) \ + static inline int ks_getc(kstream_t *ks) \ + { \ + if (ks_err(ks)) return -3; \ + if (ks->is_eof && ks->begin >= ks->end) return -1; \ + if (ks->begin >= ks->end) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; return -1;} \ + if (ks->end == -1) { ks->is_eof = 1; return -3;}\ + } \ + return (int)ks->buf[ks->begin++]; \ + } + +#ifndef KSTRING_T +#define KSTRING_T kstring_t +typedef struct __kstring_t { + size_t l, m; + char *s; +} kstring_t; +#endif + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +#define __KS_GETUNTIL(__read, __bufsize) \ + static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ + { \ + int gotany = 0; \ + if (dret) *dret = 0; \ + str->l = append? str->l : 0; \ + for (;;) { \ + int i; \ + if (ks_err(ks)) return -3; \ + if (ks->begin >= ks->end) { \ + if (!ks->is_eof) { \ + ks->begin = 0; \ + ks->end = __read(ks->f, ks->buf, __bufsize); \ + if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ + } else break; \ + } \ + if (delimiter == KS_SEP_LINE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == '\n') break; \ + } else if (delimiter > KS_SEP_MAX) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (ks->buf[i] == delimiter) break; \ + } else if (delimiter == KS_SEP_SPACE) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i])) break; \ + } else if (delimiter == KS_SEP_TAB) { \ + for (i = ks->begin; i < ks->end; ++i) \ + if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ + } else i = 0; /* never come to here! */ \ + if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ + str->m = str->l + (i - ks->begin) + 1; \ + kroundup32(str->m); \ + str->s = (char*)realloc(str->s, str->m); \ + } \ + gotany = 1; \ + memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ + str->l = str->l + (i - ks->begin); \ + ks->begin = i + 1; \ + if (i < ks->end) { \ + if (dret) *dret = ks->buf[i]; \ + break; \ + } \ + } \ + if (!gotany && ks_eof(ks)) return -1; \ + if (str->s == 0) { \ + str->m = 1; \ + str->s = (char*)calloc(1, 1); \ + } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ + str->s[str->l] = '\0'; \ + return str->l; \ + } \ + static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ + { return ks_getuntil2(ks, delimiter, str, dret, 0); } + +#define KSTREAM_INIT(type_t, __read, __bufsize) \ + __KS_TYPE(type_t) \ + __KS_BASIC(type_t, __bufsize) \ + __KS_GETC(__read, __bufsize) \ + __KS_GETUNTIL(__read, __bufsize) + +#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) + +#define __KSEQ_BASIC(SCOPE, type_t) \ + SCOPE kseq_t *kseq_init(type_t fd) \ + { \ + kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \ + s->f = ks_init(fd); \ + return s; \ + } \ + SCOPE void kseq_destroy(kseq_t *ks) \ + { \ + if (!ks) return; \ + free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ + ks_destroy(ks->f); \ + free(ks); \ + } + +/* Return value: + >=0 length of the sequence (normal) + -1 end-of-file + -2 truncated quality string + -3 error reading stream + */ +#define __KSEQ_READ(SCOPE) \ + SCOPE int kseq_read(kseq_t *seq) \ + { \ + int c,r; \ + kstream_t *ks = seq->f; \ + if (seq->last_char == 0) { /* then jump to the next header line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error*/ \ + seq->last_char = c; \ + } /* else: the first header char has been read in the previous call */ \ + seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \ + if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ + if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ + seq->seq.m = 256; \ + seq->seq.s = (char*)malloc(seq->seq.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ + if (c == '\n') continue; /* skip empty lines */ \ + seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ + ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ + } \ + if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ + if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ + seq->seq.m = seq->seq.l + 2; \ + kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ + } \ + seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ + if (c != '+') return seq->seq.l; /* FASTA */ \ + if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \ + seq->qual.m = seq->seq.m; \ + seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ + } \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ + if (c == -1) return -2; /* error: no quality string */ \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \ + if (c == -3) return -3; /* stream error */ \ + seq->last_char = 0; /* we have not come to the next header line */ \ + if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ + return seq->seq.l; \ + } + +#define __KSEQ_TYPE(type_t) \ + typedef struct { \ + kstring_t name, comment, seq, qual; \ + int last_char; \ + kstream_t *f; \ + } kseq_t; + +#define KSEQ_INIT2(SCOPE, type_t, __read) \ + KSTREAM_INIT(type_t, __read, 16384) \ + __KSEQ_TYPE(type_t) \ + __KSEQ_BASIC(SCOPE, type_t) \ + __KSEQ_READ(SCOPE) + +#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) + +#define KSEQ_DECLARE(type_t) \ + __KS_TYPE(type_t) \ + __KSEQ_TYPE(type_t) \ + extern kseq_t *kseq_init(type_t fd); \ + void kseq_destroy(kseq_t *ks); \ + int kseq_read(kseq_t *seq); + +#endif diff --git a/src/denovo_discovery/candidate_region.cpp b/src/denovo_discovery/candidate_region.cpp index 106b6696..3a5bc632 100644 --- a/src/denovo_discovery/candidate_region.cpp +++ b/src/denovo_discovery/candidate_region.cpp @@ -283,20 +283,17 @@ void load_all_candidate_regions_pileups_from_fastq(const fs::path& reads_filepat { // TODO: we need to read only until the max read id for (auto& id_and_sequence : sequencesBuffer) { - // did we reach the end already? - if (!fh.eof()) { // no - // print some logging - if (id && id % 100000 == 0) - BOOST_LOG_TRIVIAL(info) << id << " reads processed..."; - - // read the read + if (id && id % 100000 == 0) { + BOOST_LOG_TRIVIAL(info) << id << " reads processed..."; + } + try { fh.get_next(); - id_and_sequence = make_pair(id, fh.read); - ++nbOfReads; - ++id; - } else { // yes - break; // we read everything already, exit this loop + } catch (std::out_of_range& err) { + break; } + id_and_sequence = make_pair(id, fh.read); + ++nbOfReads; + ++id; } } diff --git a/src/fastaq_handler.cpp b/src/fastaq_handler.cpp index 30235866..47ba9555 100644 --- a/src/fastaq_handler.cpp +++ b/src/fastaq_handler.cpp @@ -1,176 +1,79 @@ #include -#include #include -//#include -#include -#include -#include #include "fastaq_handler.h" -#include "utils.h" -FastaqHandler::FastaqHandler(const std::string& filepath) - : gzipped(false) - , instream(&inbuf) +FastaqHandler::FastaqHandler(const std::string filepath) + : closed(false) + , filepath(filepath) , num_reads_parsed(0) { - // level for boost logging - // logging::core::get()->set_filter(logging::trivial::severity >= g_log_level); - - BOOST_LOG_TRIVIAL(debug) << "Open fastaq file" << filepath; - fastaq_file.open(filepath); - if (not fastaq_file.is_open()) { - std::cerr << "Unable to open fastaq file " << filepath << std::endl; - std::exit(EXIT_FAILURE); - } - try { - if (filepath.substr(filepath.length() - 2) == "gz") { - inbuf.push(boost::iostreams::gzip_decompressor()); - gzipped = true; - } - inbuf.push(fastaq_file); - } catch (const boost::iostreams::gzip_error& e) { - std::cerr << "Problem transfering file contents to boost stream: " << e.what() - << '\n'; + this->fastaq_file = gzopen(filepath.c_str(), "r"); + if (this->fastaq_file == nullptr) { + throw std::ios_base::failure("Unable to open " + this->filepath); } + this->inbuf = kseq_init(this->fastaq_file); } -FastaqHandler::~FastaqHandler() { close(); } +FastaqHandler::~FastaqHandler() { this->close(); } -bool FastaqHandler::eof() -{ - int c = instream.peek(); - return (c == EOF); -} +bool FastaqHandler::eof() const { return ks_eof(this->inbuf->f); } void FastaqHandler::get_next() { - // cout << "next "; - if (!line.empty() and (line[0] == '>' or line[0] == '@')) { - // cout << "read name line " << num_reads_parsed << " " << line << endl; - name = line.substr(1); - ++num_reads_parsed; - read.clear(); + if (this->eof()) { + throw std::out_of_range("Read requested after the end of file was reached"); } + int read_status = kseq_read(this->inbuf); - while (getline(instream, line).good()) { - if (!line.empty() and line[0] == '+') { - // skip this line and the qual score line - getline(instream, line); - // cout << "qual score line ." << line << "." << endl; - } else if (line.empty() || line[0] == '>' || line[0] == '@') { - if (!read.empty() - or line.empty()) // ok we'll allow reads with no name, removed - { - return; - } - // cout << num_reads_parsed << " " << line << endl; - name = line.substr(1); - ++num_reads_parsed; - read.clear(); - } else { - // cout << "read line ." << line << "." << endl; - read += line; - } + // if not eof but we get -1 here then it was an empty file/read/line + if (read_status == -1) { + throw std::out_of_range("Read requested after the end of file was reached"); } -} - -void FastaqHandler::skip_next() -{ - // cout << "skip "; - if (!line.empty() and (line[0] == '>' or line[0] == '@')) { - ++num_reads_parsed; - } - - while (getline(instream, line).good()) { - if (!line.empty() and line[0] == '+') { - // skip this line and the qual score line - getline(instream, line); - // cout << "qual score line ." << line << "." << endl; - } else if (line.empty() or line[0] == '>' or line[0] == '@') { - return; - } - } -} - -void print(std::ifstream& infile) -{ - char file; - std::vector read; - uint i = 0; - - // Read infile to vector - while (!infile.eof()) { - infile >> file; - read.push_back(file); + if (read_status == -2) { + throw std::runtime_error("Truncated quality string detected"); + } else if (read_status == -3) { + throw std::ios_base::failure("Error reading " + this->filepath); } - // Print read vector - for (i = 0; i < read.size(); i++) { - std::cout << read[i]; - } + ++this->num_reads_parsed; + this->name = this->inbuf->name.s; + this->read = this->inbuf->seq.s; } -void print(std::istream& infile) +void FastaqHandler::get_nth_read(const uint32_t& idx) { - char file; - std::vector read; - int i = 0; - - // Read infile to vector - while (!infile.eof()) { - infile >> file; - read.push_back(file); - } - - // Print read vector - for (auto i = 0; i < read.size(); i++) { - std::cout << read[i]; + // edge case where no reads have been loaded yet + if (this->num_reads_parsed == 0) { + this->get_next(); } -} - -void FastaqHandler::get_id(const uint32_t& id) -{ - const uint32_t one_based_id = id + 1; - if (one_based_id < num_reads_parsed) { - BOOST_LOG_TRIVIAL(warning) - << "restart buffer as have id " << num_reads_parsed << " and want id " - << one_based_id << " (" << id << ") with 0-based indexing."; + const uint32_t one_based_idx = idx + 1; + if (one_based_idx < this->num_reads_parsed) { num_reads_parsed = 0; name.clear(); read.clear(); - line.clear(); - assert(name.empty()); - assert(read.empty()); - assert(line.empty()); - - instream.ignore(std::numeric_limits::max()); - fastaq_file.seekg(0, fastaq_file.beg); - instream.clear(); - inbuf.pop(); - if (gzipped) { - inbuf.pop(); - inbuf.push(boost::iostreams::gzip_decompressor()); - } - inbuf.push(fastaq_file); - } - - while (id > 1 and num_reads_parsed < id) { - skip_next(); - if (eof()) { - break; - } + gzrewind(this->fastaq_file); + kseq_rewind(this->inbuf); } - while (num_reads_parsed <= id) { - get_next(); - if (eof()) { - break; - } + while (this->num_reads_parsed < one_based_idx) { + this->get_next(); } } void FastaqHandler::close() { - BOOST_LOG_TRIVIAL(debug) << "Close fastaq file"; - fastaq_file.close(); + if (!this->is_closed()) { + const auto closed_status = gzclose(this->fastaq_file); + kseq_destroy(this->inbuf); + + if (closed_status != Z_OK) { + std::ostringstream err_msg; + err_msg << "Failed to close " << this->filepath + << ". Got zlib return code: " << closed_status << std::endl; + throw std::ios_base::failure(err_msg.str()); + } + this->closed = true; + } } + +bool FastaqHandler::is_closed() const { return this->closed; } diff --git a/src/get_vcf_ref_main.cpp b/src/get_vcf_ref_main.cpp index 542e7e16..e025816b 100644 --- a/src/get_vcf_ref_main.cpp +++ b/src/get_vcf_ref_main.cpp @@ -38,7 +38,7 @@ int pandora_get_vcf_ref(int argc, char* argv[]) // the "pandora walk" comand for (const auto& prg_ptr : prgs) { found = false; - readfile.get_id(0); + readfile.get_nth_read(0); while (not readfile.eof()) { npath = prg_ptr->get_valid_vcf_reference(readfile.read); if (not npath.empty()) { @@ -48,7 +48,11 @@ int pandora_get_vcf_ref(int argc, char* argv[]) // the "pandora walk" comand found = true; break; } - readfile.get_next(); + try { + readfile.get_next(); + } catch (std::out_of_range& err) { + break; + } } if (!found) { diff --git a/src/pangenome/pangraph.cpp b/src/pangenome/pangraph.cpp index b448951e..9f668f83 100644 --- a/src/pangenome/pangraph.cpp +++ b/src/pangenome/pangraph.cpp @@ -545,7 +545,7 @@ void pangenome::Graph::save_mapped_read_strings( outhandle.open(outdir + "/" + node_ptr.second->get_name() + "/" + node_ptr.second->get_name() + ".reads.fa"); for (const auto& coord : read_overlap_coordinates) { - readfile.get_id(coord[0]); + readfile.get_nth_read(coord[0]); start = (uint32_t)std::max((int32_t)coord[1] - buff, 0); end = std::min(coord[2] + (uint32_t)buff, (uint32_t)readfile.read.length()); outhandle << ">" << readfile.name << " pandora: " << coord[0] << " " diff --git a/src/utils.cpp b/src/utils.cpp index c468aeb3..190b9f37 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -120,7 +120,11 @@ void read_prg_file(std::vector>& prgs, FastaqHandler fh(filepath); while (!fh.eof()) { - fh.get_next(); + try { + fh.get_next(); + } catch (std::out_of_range& err) { + break; + } if (fh.name.empty() or fh.read.empty()) continue; auto s = std::make_shared(LocalPRG(id, fh.name, @@ -171,7 +175,11 @@ void load_vcf_refs_file(const std::string& filepath, VCFRefs& vcf_refs) FastaqHandler fh(filepath); while (!fh.eof()) { - fh.get_next(); + try { + fh.get_next(); + } catch (std::out_of_range& err) { + break; + } if (!fh.name.empty() && !fh.read.empty()) { vcf_refs[fh.name] = fh.read; } @@ -473,20 +481,17 @@ uint32_t pangraph_from_read_file(const std::string& filepath, #pragma omp critical(ReadFileMutex) { for (auto& sequence : sequencesBuffer) { - // did we reach the end already? - if (!fh.eof()) { // no - // print some logging - if (id && id % 100000 == 0) - BOOST_LOG_TRIVIAL(info) << id << " reads processed..."; - - // read the read + if (id && id % 100000 == 0) { + BOOST_LOG_TRIVIAL(info) << id << " reads processed..."; + } + try { fh.get_next(); - sequence.initialize(id, fh.name, fh.read, w, k); - ++nbOfReads; - ++id; - } else { // yes - break; // we read everything already, exit this loop + } catch (std::out_of_range& err) { + break; } + sequence.initialize(id, fh.name, fh.read, w, k); + ++nbOfReads; + ++id; } } diff --git a/src/walk_main.cpp b/src/walk_main.cpp index 8ca520ff..e06f9362 100644 --- a/src/walk_main.cpp +++ b/src/walk_main.cpp @@ -49,9 +49,12 @@ int pandora_walk(int argc, char* argv[]) // the "pandora walk" comand std::string read_string; FastaqHandler readfile(argv[2]); while (not readfile.eof()) { - readfile.get_next(); - // cout << "Try to find gene " << readfile.num_reads_parsed << " " << - // readfile.name << endl << readfile.read << endl; + try { + readfile.get_next(); + } catch (std::out_of_range& err) { + break; + } + for (const auto& prg_ptr : prgs) { npath = prg_ptr->prg.nodes_along_string(readfile.read); if (not npath.empty()) { diff --git a/test/estimate_parameters_test.cpp b/test/estimate_parameters_test.cpp index 41b0d639..57235846 100644 --- a/test/estimate_parameters_test.cpp +++ b/test/estimate_parameters_test.cpp @@ -7,6 +7,8 @@ using namespace std; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(EstimateParameters_FitMeanCovg, Empty) { std::vector v = {}; @@ -204,9 +206,9 @@ TEST(EstimateParameters_EstimateParameters, PangraphWithNodes_SimpleBinomial) bool bin = true, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg @@ -228,9 +230,9 @@ TEST( bool bin = true, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads3.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads3.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg @@ -251,9 +253,9 @@ TEST(EstimateParameters_EstimateParameters, PangraphWithNodes_SimpleNegativeBino bool bin = false, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg @@ -275,9 +277,9 @@ TEST(EstimateParameters_EstimateParameters, bool bin = false, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads4.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads4.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg @@ -300,9 +302,9 @@ TEST(EstimateParameters_EstimateParameters, bool bin = true, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads4.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads4.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg @@ -324,9 +326,9 @@ TEST(EstimateParameters_EstimateParameters, PangraphWithNodes_NoiseReads) bool bin = false, illumina = true; auto pangraph = std::make_shared(pangenome::Graph()); - pangraph_from_read_file("../../test/test_cases/estimate_parameters_reads2.fa", - pangraph, index, prgs, w, k, 1, e_rate, min_cluster_size, genome_size, - illumina); + const auto filepath = TEST_CASE_DIR + "estimate_parameters_reads2.fa"; + pangraph_from_read_file(filepath, pangraph, index, prgs, w, k, 1, e_rate, + min_cluster_size, genome_size, illumina); pangraph->add_hits_to_kmergraphs(prgs); auto expected_depth_covg diff --git a/test/fastaq_handler_test.cpp b/test/fastaq_handler_test.cpp index 745aa87c..7f17cae7 100644 --- a/test/fastaq_handler_test.cpp +++ b/test/fastaq_handler_test.cpp @@ -6,57 +6,45 @@ using namespace std; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + +TEST(FastaqHandlerTest, non_existant_file_throws_exception) +{ + EXPECT_THROW(FastaqHandler fh("fake.file"), std::ios_base::failure); +} + TEST(FastaqHandlerTest, create_fa) { - FastaqHandler fh("../../test/test_cases/reads.fa"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); EXPECT_EQ((uint32_t)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); + + EXPECT_TRUE(fh.fastaq_file); } TEST(FastaqHandlerTest, create_fq) { - FastaqHandler fh("../../test/test_cases/reads.fq"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fq"); EXPECT_EQ((uint)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); + EXPECT_TRUE(fh.fastaq_file); } TEST(FastaqHandlerTest, create_fagz) { - FastaqHandler fh("../../test/test_cases/reads.fa.gz"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa.gz"); EXPECT_EQ((uint)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); + EXPECT_TRUE(fh.fastaq_file); } TEST(FastaqHandlerTest, create_fqgz) { - FastaqHandler fh("../../test/test_cases/reads.fq.gz"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fq.gz"); EXPECT_EQ((uint)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); -} - -TEST(FastaqHandlerTest, getline_fa) -{ - FastaqHandler fh("../../test/test_cases/reads.fa"); - bool foundline = false; - while (std::getline(fh.instream, fh.line)) { - foundline = true; - } - EXPECT_TRUE(foundline); -} - -TEST(FastaqHandlerTest, getline_fagz) -{ - FastaqHandler fh("../../test/test_cases/reads.fa.gz"); - bool foundline = false; - while (std::getline(fh.instream, fh.line)) { - foundline = true; - } - EXPECT_TRUE(foundline); + EXPECT_TRUE(fh.fastaq_file); } TEST(FastaqHandlerTest, get_next) { - FastaqHandler fh("../../test/test_cases/reads.fa"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); fh.get_next(); EXPECT_EQ((uint32_t)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); @@ -82,175 +70,201 @@ TEST(FastaqHandlerTest, get_next) EXPECT_EQ(fh.name, "read4"); EXPECT_EQ(fh.read, "another junk line"); + EXPECT_THROW(fh.get_next(), std::out_of_range); +} + +TEST(FastaqHandlerTest, eof) +{ + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); + EXPECT_FALSE(fh.eof()); fh.get_next(); - EXPECT_EQ((uint)5, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read4"); - EXPECT_EQ(fh.read, "another junk line"); + EXPECT_FALSE(fh.eof()); + fh.get_next(); + EXPECT_FALSE(fh.eof()); + fh.get_next(); + EXPECT_FALSE(fh.eof()); + fh.get_next(); + EXPECT_FALSE(fh.eof()); + fh.get_next(); + EXPECT_TRUE(fh.eof()); } -TEST(FastaqHandlerTest, get_id_fa) +TEST(FastaqHandlerTest, get_nth_read_fa) { - FastaqHandler fh("../../test/test_cases/reads.fa"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint32_t)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint32_t)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint32_t)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint32_t)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); } -TEST(FastaqHandlerTest, get_id_fq) +TEST(FastaqHandlerTest, get_nth_read_fq) { - FastaqHandler fh("../../test/test_cases/reads.fq"); - - fh.get_id(1); - EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read1"); - EXPECT_EQ(fh.read, "should copy the phrase *should*"); - - fh.get_id(0); - EXPECT_EQ((uint32_t)1, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read0"); - EXPECT_EQ(fh.read, "to be ignored"); - - fh.get_id(2); - EXPECT_EQ((uint32_t)3, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read2"); - EXPECT_EQ(fh.read, "this time we should get *is time *"); - - fh.get_id(1); - EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read1"); - EXPECT_EQ(fh.read, "should copy the phrase *should*"); + const std::string filepath = std::tmpnam(nullptr); + { + std::ofstream outstream(filepath); + outstream << "@read1 comment\nACGT\n+\n^^^^\n"; + outstream << "@read2 comment\nGCGT\n+\n^^^^\n"; + outstream << "@read3 comment\nTCGT\n+\n^^^^\n"; + } - fh.get_id(0); - EXPECT_EQ((uint32_t)1, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read0"); - EXPECT_EQ(fh.read, "to be ignored"); + FastaqHandler fh(filepath); + fh.get_nth_read(1); + uint32_t expected_num_parsed = 2; + std::string expected_name = "read2"; + std::string expected_read = "GCGT"; + EXPECT_EQ(expected_num_parsed, fh.num_reads_parsed); + EXPECT_EQ(expected_name, fh.name); + EXPECT_EQ(expected_read, fh.read); + + fh.get_nth_read(0); + expected_num_parsed = 1; + expected_name = "read1"; + expected_read = "ACGT"; + EXPECT_EQ(expected_num_parsed, fh.num_reads_parsed); + EXPECT_EQ(expected_name, fh.name); + EXPECT_EQ(expected_read, fh.read); + + fh.get_nth_read(2); + expected_num_parsed = 3; + expected_name = "read3"; + expected_read = "TCGT"; + EXPECT_EQ(expected_num_parsed, fh.num_reads_parsed); + EXPECT_EQ(expected_name, fh.name); + EXPECT_EQ(expected_read, fh.read); +} - fh.get_id(1); - EXPECT_EQ((uint32_t)2, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read1"); - EXPECT_EQ(fh.read, "should copy the phrase *should*"); +TEST(FastaqHandlerTest, get_nth_read_past_end_throws_error) +{ + const std::string filepath = std::tmpnam(nullptr); + { + std::ofstream outstream(filepath); + outstream << "@read1 comment\nACGT\n+\n^^^^\n"; + outstream << "@read2 comment\nGCGT\n+\n^^^^\n"; + } - fh.get_id(2); - EXPECT_EQ((uint32_t)3, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read2"); - EXPECT_EQ(fh.read, "this time we should get *is time *"); + FastaqHandler fh(filepath); + EXPECT_THROW(fh.get_nth_read(10), std::out_of_range); +} - fh.get_id(4); - EXPECT_EQ((uint)5, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read4"); +TEST(FastaqHandlerTest, truncated_quality_string_throws_exception) +{ + const std::string filepath = std::tmpnam(nullptr); + { + std::ofstream outstream(filepath); + outstream << "@read1 comment\nACGT\n+\n^^^\n"; + } - fh.get_id(3); - EXPECT_EQ((uint)4, fh.num_reads_parsed); - EXPECT_EQ(fh.name, "read3"); + FastaqHandler fh(filepath); + EXPECT_THROW(fh.get_next(), std::runtime_error); } -TEST(FastaqHandlerTest, get_id_fagz) +TEST(FastaqHandlerTest, get_nth_read_fagz) { - FastaqHandler fh("../../test/test_cases/reads.fa.gz"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa.gz"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); } -TEST(FastaqHandlerTest, get_id_fqgz) +TEST(FastaqHandlerTest, get_nth_read_fqgz) { - FastaqHandler fh("../../test/test_cases/reads.fq.gz"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fq.gz"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(0); + fh.get_nth_read(0); EXPECT_EQ((uint)1, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read0"); EXPECT_EQ(fh.read, "to be ignored"); - fh.get_id(1); + fh.get_nth_read(1); EXPECT_EQ((uint)2, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read1"); EXPECT_EQ(fh.read, "should copy the phrase *should*"); - fh.get_id(2); + fh.get_nth_read(2); EXPECT_EQ((uint)3, fh.num_reads_parsed); EXPECT_EQ(fh.name, "read2"); EXPECT_EQ(fh.read, "this time we should get *is time *"); @@ -258,18 +272,48 @@ TEST(FastaqHandlerTest, get_id_fqgz) TEST(FastaqHandlerTest, close) { - FastaqHandler fh("../../test/test_cases/reads.fa"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); + EXPECT_EQ((uint32_t)0, fh.num_reads_parsed); + EXPECT_TRUE(fh.fastaq_file); + fh.close(); + EXPECT_TRUE(fh.is_closed()); +} + +TEST(FastaqHandlerTest, close_multiple_times_does_not_error) +{ + FastaqHandler fh(TEST_CASE_DIR + "reads.fa"); EXPECT_EQ((uint32_t)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); + EXPECT_TRUE(fh.fastaq_file); + fh.close(); + EXPECT_TRUE(fh.is_closed()); + fh.close(); fh.close(); - EXPECT_FALSE(fh.fastaq_file.is_open()); } TEST(FastaqHandlerTest, close_fqgz) { - FastaqHandler fh("../../test/test_cases/reads.fq.gz"); + FastaqHandler fh(TEST_CASE_DIR + "reads.fq.gz"); EXPECT_EQ((uint)0, fh.num_reads_parsed); - EXPECT_TRUE(fh.fastaq_file.is_open()); + EXPECT_FALSE(fh.is_closed()); fh.close(); - EXPECT_FALSE(fh.fastaq_file.is_open()); + EXPECT_TRUE(fh.is_closed()); +} + +TEST(FastaqHandlerTest, filepath_test) +{ + FastaqHandler fh(TEST_CASE_DIR + "reads.fq.gz"); + EXPECT_EQ(TEST_CASE_DIR + "reads.fq.gz", fh.filepath); } + +TEST(FastaqHandlerTest, get_next_on_empty_file_throws_exception) +{ + const std::string filepath = std::tmpnam(nullptr); + { + std::ofstream outstream(filepath); + outstream << ""; + } + + FastaqHandler fh(filepath); + EXPECT_FALSE(fh.eof()); + EXPECT_THROW(fh.get_next(), std::out_of_range); +} \ No newline at end of file diff --git a/test/index_test.cpp b/test/index_test.cpp index bfcac80e..d11713c7 100644 --- a/test/index_test.cpp +++ b/test/index_test.cpp @@ -12,6 +12,8 @@ using namespace std; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(IndexTest, add_record) { Index idx; @@ -152,33 +154,33 @@ TEST(IndexTest, merging_indexes) uint32_t w = 2, k = 3; std::vector> prgs; auto index = std::make_shared(); - auto outdir = "../../test/test_cases/kgs/"; + auto outdir = TEST_CASE_DIR + "kgs/"; - read_prg_file(prgs, "../../test/test_cases/prg1.fa", 1); + read_prg_file(prgs, TEST_CASE_DIR + "prg1.fa", 1); index_prgs(prgs, index, w, k, outdir); - index->save("../../test/test_cases/prg1.fa.idx"); + index->save(TEST_CASE_DIR + "prg1.fa.idx"); prgs.clear(); index->clear(); - read_prg_file(prgs, "../../test/test_cases/prg2.fa", 2); + read_prg_file(prgs, TEST_CASE_DIR + "prg2.fa", 2); index_prgs(prgs, index, w, k, outdir); - index->save("../../test/test_cases/prg2.fa.idx"); + index->save(TEST_CASE_DIR + "prg2.fa.idx"); prgs.clear(); index->clear(); - read_prg_file(prgs, "../../test/test_cases/prg3.fa", 3); + read_prg_file(prgs, TEST_CASE_DIR + "prg3.fa", 3); index_prgs(prgs, index, w, k, outdir); - index->save("../../test/test_cases/prg3.fa.idx"); + index->save(TEST_CASE_DIR + "prg3.fa.idx"); // merge auto index_merged = std::make_shared(); - index_merged->load("../../test/test_cases/prg1.fa.idx"); - index_merged->load("../../test/test_cases/prg2.fa.idx"); - index_merged->load("../../test/test_cases/prg3.fa.idx"); + index_merged->load(TEST_CASE_DIR + "prg1.fa.idx"); + index_merged->load(TEST_CASE_DIR + "prg2.fa.idx"); + index_merged->load(TEST_CASE_DIR + "prg3.fa.idx"); // now an index from all 4 in prgs.clear(); auto index_all = std::make_shared(); - read_prg_file(prgs, "../../test/test_cases/prg0123.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg0123.fa"); index_prgs(prgs, index_all, w, k, outdir); } diff --git a/test/localPRG_test.cpp b/test/localPRG_test.cpp index d55023a9..66ae4f84 100644 --- a/test/localPRG_test.cpp +++ b/test/localPRG_test.cpp @@ -22,6 +22,8 @@ using namespace std; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(LocalPRGTest, create) { LocalPRG l0(0, "empty", ""); @@ -1494,7 +1496,7 @@ TEST(LocalPRGTest, moreupdateVCF) { // load PRGs from file std::vector> prgs; - read_prg_file(prgs, "../../test/test_cases/updatevcf_test.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "updatevcf_test.fa"); EXPECT_EQ((uint)3, prgs.size()); @@ -1616,7 +1618,7 @@ class LocalPRGTest___get_number_of_bases_in_local_path_before_a_given_position__ std::make_shared("", Interval(35, 45), 1)); } - void TearDown() override {} + void TearDown() override { } LocalPRGMockExposesTestedMethod local_prg_mock; std::vector empty_local_path; @@ -1800,7 +1802,7 @@ class kmer_node_from_40_to_50 = std::make_shared(2, path_from_40_to_50); } - void TearDown() override {} + void TearDown() override { } LocalPRGMockExposesTestedMethod local_prg_mock; KmerNodePtr kmer_node_from_3_to_30; diff --git a/test/pangraph_test.cpp b/test/pangraph_test.cpp index 29c6bd23..112c7410 100644 --- a/test/pangraph_test.cpp +++ b/test/pangraph_test.cpp @@ -16,6 +16,8 @@ using namespace pangenome; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(PangenomeGraphConstructor, constructors_and_get_sample) { { @@ -1094,7 +1096,7 @@ TEST(PangenomeGraphTest, split_node_by_reads) EXPECT_EQ(pg1.nodes[7]->covg, (uint)0); } -TEST(PangenomeGraphTest, add_hits_to_kmergraph) {} +TEST(PangenomeGraphTest, add_hits_to_kmergraph) { } TEST(PangenomeGraphTest, save_matrix) { @@ -1117,7 +1119,7 @@ TEST(PangenomeGraphTest, save_matrix) pg.add_hits_between_PRG_and_sample(1, "sample1", kmp); pg.add_hits_between_PRG_and_sample(2, "sample3", kmp); - pg.save_matrix("../../test/test_cases/pangraph_test_save.matrix", names); + pg.save_matrix(TEST_CASE_DIR + "pangraph_test_save.matrix", names); } TEST(PangenomeGraphTest, save_mapped_read_strings) @@ -1172,14 +1174,13 @@ TEST(PangenomeGraphTest, save_mapped_read_strings) std::string expected2 = ">read2 pandora: 2 2:10 - \nis time \n>read1 pandora: 1 0:6 + \nshould\n"; - pg.save_mapped_read_strings( - "../../test/test_cases/reads.fa", "save_mapped_read_strings"); + pg.save_mapped_read_strings(TEST_CASE_DIR + "reads.fa", "save_mapped_read_strings"); std::ifstream ifs("save_mapped_read_strings/zero/zero.reads.fa"); std::string content( (std::istreambuf_iterator(ifs)), (std::istreambuf_iterator())); EXPECT_TRUE((content == expected1) or (content == expected2)); - pg.save_mapped_read_strings("../../test/test_cases/reads.fa", "."); + pg.save_mapped_read_strings(TEST_CASE_DIR + "reads.fa", "."); std::ifstream ifs2("zero/zero.reads.fa"); std::string content2( (std::istreambuf_iterator(ifs2)), (std::istreambuf_iterator())); diff --git a/test/test_cases/reads.fq b/test/test_cases/reads.fq index a161eb6a..cb114c01 100644 --- a/test/test_cases/reads.fq +++ b/test/test_cases/reads.fq @@ -1,21 +1,21 @@ ->read0 +@read0 to be ignored + -£$£$%£$%£$%£ ->read1 +!!!!!!!!!!!!! +@read1 should copy the phrase *should* + -@$£$%^%^^&&*(%^$$^@!£!!$£$%Y^YGB$%YW ->read2 +@@#!#@#!@@!#!@##!@!@#!@#!@#@#@@ +@read2 this time we should get *is time * + -$%$&%^I&*(*(*%^&$$%@£$@£$@%$&%^&% ->read3 +!!!!!!!^!^^!^!^!^!^!^!^!^!^!^!^!^! +@read3 just in case having more is a problem + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ->read4 +@read4 and another + -%%%%%%%%%% +%%%%%%%%%%% diff --git a/test/test_cases/reads.fq.gz b/test/test_cases/reads.fq.gz index d576f3ea..4fec8d17 100644 Binary files a/test/test_cases/reads.fq.gz and b/test/test_cases/reads.fq.gz differ diff --git a/test/utils_test.cpp b/test/utils_test.cpp index 5ca93894..4b86fbcf 100644 --- a/test/utils_test.cpp +++ b/test/utils_test.cpp @@ -17,6 +17,8 @@ using namespace std; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(UtilsTest, split) { vector v = { "abc", "def", "ghi" }; @@ -49,12 +51,12 @@ TEST(UtilsTest, readPrgFile) // simple case first, single prg with empty string sequence // doesn't get added to prgs - read_prg_file(prgs, "../../test/test_cases/prg0.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg0.fa"); uint32_t j = 0; EXPECT_EQ(prgs.size(), j); // single prg with simple sequence - read_prg_file(prgs, "../../test/test_cases/prg1.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg1.fa"); LocalPRG l1(1, "prg1", "AGCT"); j = 1; EXPECT_EQ(prgs.size(), j); @@ -65,7 +67,7 @@ TEST(UtilsTest, readPrgFile) EXPECT_EQ(prgs[0]->prg, l1.prg); // single prg with a variant site - read_prg_file(prgs, "../../test/test_cases/prg2.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg2.fa"); LocalPRG l2(2, "prg2", "A 5 GC 6 G 5 T"); j = 2; EXPECT_EQ(prgs.size(), j); @@ -76,7 +78,7 @@ TEST(UtilsTest, readPrgFile) EXPECT_EQ(prgs[1]->prg, l2.prg); // single prg with a nested variant site - read_prg_file(prgs, "../../test/test_cases/prg3.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg3.fa"); LocalPRG l3 = LocalPRG(3, "prg3", "A 5 G 7 C 8 T 7 6 G 5 T"); j = 3; EXPECT_EQ(prgs.size(), j); @@ -89,7 +91,7 @@ TEST(UtilsTest, readPrgFile) // now a prg input file with all 4 in prgs.clear(); EXPECT_EQ(prgs.size(), j); - read_prg_file(prgs, "../../test/test_cases/prg0123.fa"); + read_prg_file(prgs, TEST_CASE_DIR + "prg0123.fa"); j = 3; EXPECT_EQ(prgs.size(), j); } @@ -100,11 +102,11 @@ TEST(UtilsTest, readPrgFile_with_offset) // simple case first, single prg with empty string sequence // doesn't get added to prgs - read_prg_file(prgs, "../../test/test_cases/prg0.fa", 1); + read_prg_file(prgs, TEST_CASE_DIR + "prg0.fa", 1); EXPECT_EQ(prgs.size(), (uint)0); // single prg with simple sequence - read_prg_file(prgs, "../../test/test_cases/prg1.fa", 1); + read_prg_file(prgs, TEST_CASE_DIR + "prg1.fa", 1); LocalPRG l1(1, "prg1", "AGCT"); EXPECT_EQ(prgs.size(), (uint)1); EXPECT_EQ(prgs[0]->id, (uint)1); @@ -113,7 +115,7 @@ TEST(UtilsTest, readPrgFile_with_offset) EXPECT_EQ(prgs[0]->prg, l1.prg); // single prg with a variant site - read_prg_file(prgs, "../../test/test_cases/prg2.fa", 3); + read_prg_file(prgs, TEST_CASE_DIR + "prg2.fa", 3); LocalPRG l2(2, "prg2", "A 5 GC 6 G 5 T"); EXPECT_EQ(prgs.size(), (uint)2); EXPECT_EQ(prgs[1]->id, (uint)3); @@ -122,7 +124,7 @@ TEST(UtilsTest, readPrgFile_with_offset) EXPECT_EQ(prgs[1]->prg, l2.prg); // single prg with a nested variant site - read_prg_file(prgs, "../../test/test_cases/prg3.fa", 40); + read_prg_file(prgs, TEST_CASE_DIR + "prg3.fa", 40); LocalPRG l3 = LocalPRG(3, "prg3", "A 5 G 7 C 8 T 7 6 G 5 T"); EXPECT_EQ(prgs.size(), (uint)3); EXPECT_EQ(prgs[2]->id, (uint)40); @@ -133,7 +135,7 @@ TEST(UtilsTest, readPrgFile_with_offset) // now a prg input file with all 4 in prgs.clear(); EXPECT_EQ(prgs.size(), (uint)0); - read_prg_file(prgs, "../../test/test_cases/prg0123.fa", 6); + read_prg_file(prgs, TEST_CASE_DIR + "prg0123.fa", 6); EXPECT_EQ(prgs.size(), (uint)3); EXPECT_EQ(prgs[0]->id, (uint)6); EXPECT_EQ(prgs[1]->id, (uint)7); @@ -747,7 +749,7 @@ TEST(UtilsTest, biggerInferLocalPRGOrderForRead) pg = new pangenome::Graph(); Index *index; std::vector> prgs; - pangraph_from_read_file("../../test/test_cases/reads.fq.gz", mhs, pg, index, prgs, + pangraph_from_read_file(TEST_CASE_DIR + "reads.fq.gz", mhs, pg, index, prgs, 1, 3, 1, 0.1); delete mhs; delete pg; }*/ @@ -968,7 +970,7 @@ TEST(UtilsTest, pangraphFromReadFile_Fa) auto pangraph = std::make_shared(pangenome::Graph()); pangraph_from_read_file( - "../../test/test_cases/read2.fa", pangraph, index, prgs, 1, 3, 1, 0.1, 1); + TEST_CASE_DIR + "read2.fa", pangraph, index, prgs, 1, 3, 1, 0.1, 1); // create a pangraph object representing the truth we expect (prg 3 4 2 1) // note that prgs 1, 3, 4 share no 3mer, but 2 shares a 3mer with each of 2 other @@ -993,7 +995,7 @@ TEST(UtilsTest, pangraphFromReadFile_Fq) auto pangraph = std::make_shared(pangenome::Graph()); pangraph_from_read_file( - "../../test/test_cases/read2.fq", pangraph, index, prgs, 1, 3, 1, 0.1, 1); + TEST_CASE_DIR + "read2.fq", pangraph, index, prgs, 1, 3, 1, 0.1, 1); // create a pangraph object representing the truth we expect (prg 3 4 2 1) // note that prgs 1, 3, 4 share no 3mer, but 2 shares a 3mer with each of 2 other diff --git a/test/vcf_test.cpp b/test/vcf_test.cpp index 741a1db5..e6e3c8da 100644 --- a/test/vcf_test.cpp +++ b/test/vcf_test.cpp @@ -18,6 +18,8 @@ using ::testing::Property; using ::testing::Return; using ::testing::ReturnRef; +const std::string TEST_CASE_DIR = "../../test/test_cases/"; + TEST(VCFTest, add_record_with_values) { @@ -752,7 +754,7 @@ class VCFTest___genotype___Fixture : public ::testing::Test { snps_only_vcf.records.push_back(snp_vcf_record_ptr); } - void TearDown() override {} + void TearDown() override { } VCFMock default_vcf; VCFMock snps_only_vcf; @@ -1215,9 +1217,9 @@ class VCFTest___merge_multi_allelic_core___Fixture : public ::testing::Test { { } - void SetUp() override {} + void SetUp() override { } - void TearDown() override {} + void TearDown() override { } VCFMock vcf; VCFMock merged_vcf; @@ -1688,7 +1690,7 @@ class VCFTest___make_gt_compatible___Fixture : public ::testing::Test { vcf.records.push_back(vcf_record_2); } - void TearDown() override {} + void TearDown() override { } VCFMock vcf; std::shared_ptr vcf_record_1; @@ -1935,9 +1937,9 @@ class VCFTest___get_all_records_overlapping_the_given_record___Fixture { } - void SetUp() override {} + void SetUp() override { } - void TearDown() override {} + void TearDown() override { } VCF vcf; VCFRecord vcf_record_5_to_10; @@ -2140,7 +2142,7 @@ class VCFTest___header___Fixture : public ::testing::Test { .WillOnce(Return(std::string("dummy_date"))); } - void TearDown() override {} + void TearDown() override { } VCF_Mock vcf; }; @@ -2250,7 +2252,7 @@ class VCFTest___to_string___Fixture : public ::testing::Test { .WillOnce(Return(std::string("##Dummy_header;\n"))); } - void TearDown() override {} + void TearDown() override { } }; TEST_F(VCFTest___to_string___Fixture, graph_type_is_simple_sv_is_snp) @@ -2509,9 +2511,9 @@ TEST_F(VCFTest___save___Fixture, save_false_then_true_flags) TEST(VCFTest, concatenate_VCFs) { - VCF::concatenate_VCFs({ "../../test/test_cases/concatenate_VCFs/fake_vcf1.vcf", - "../../test/test_cases/concatenate_VCFs/fake_vcf2.vcf", - "../../test/test_cases/concatenate_VCFs/fake_vcf3.vcf" }, + VCF::concatenate_VCFs({ TEST_CASE_DIR + "concatenate_VCFs/fake_vcf1.vcf", + TEST_CASE_DIR + "concatenate_VCFs/fake_vcf2.vcf", + TEST_CASE_DIR + "concatenate_VCFs/fake_vcf3.vcf" }, "concatenated_vcf.vcf"); std::vector actual = get_vector_of_strings_from_file("concatenated_vcf.vcf");