Skip to content

Commit

Permalink
Merge pull request #3028 from vgteam/nested-call
Browse files Browse the repository at this point in the history
WIP: Nested caller
  • Loading branch information
glennhickey authored Feb 27, 2021
2 parents 6df9a3f + f63889b commit 292717e
Show file tree
Hide file tree
Showing 11 changed files with 1,095 additions and 84 deletions.
2 changes: 1 addition & 1 deletion src/algorithms/k_widest_paths.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ vector<pair<double, vector<handle_t>>> yens_k_widest_paths(const HandleGraph* g,
}
total_path.first = total_length > 0 ? total_support / total_length : 0;
}
pair<map<vector<handle_t>, size_t>::iterator, bool> ins = B.insert(make_pair(total_path.second, i + 1));
pair<map<vector<handle_t>, size_t>::iterator, bool> ins = B.insert(make_pair(total_path.second, i));
if (ins.second == true) {
score_to_B.insert(make_pair(total_path.first, ins.first));
} // todo: is there any reason we'd need to update the score of an existing entry in B?
Expand Down
780 changes: 746 additions & 34 deletions src/graph_caller.cpp

Large diffs are not rendered by default.

197 changes: 187 additions & 10 deletions src/graph_caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,18 @@ class GraphCaller {
/// For any that return false, try the children, etc. (when recurse_on_fail true)
/// Snarls are processed in parallel
virtual void call_top_level_snarls(const HandleGraph& graph,
int ploidy,
bool recurse_on_fail = true);

/// For every chain, cut it up into pieces using max_edges and max_trivial to cap the size of each piece
/// then make a fake snarl for each chain piece and call it. If a fake snarl fails to call,
/// It's child chains will be recursed on (if selected)_
virtual void call_top_level_chains(const HandleGraph& graph,
int ploidy,
size_t max_edges,
size_t max_trivial,
bool recurse_on_fail = true);

/// Call a given snarl, and print the output to out_stream
virtual bool call_snarl(const Snarl& snarl, int ploidy) = 0;
virtual bool call_snarl(const Snarl& snarl) = 0;

protected:

Expand Down Expand Up @@ -84,11 +82,15 @@ class VCFOutputCaller {

protected:

/// print a vcf variant
/// convert a traversal into an allele string
string trav_string(const HandleGraph& graph, const SnarlTraversal& trav) const;

/// print a vcf variant
void emit_variant(const PathPositionHandleGraph& graph, SnarlCaller& snarl_caller,
const Snarl& snarl, const vector<SnarlTraversal>& called_traversals,
const vector<int>& genotype, int ref_trav_idx, const unique_ptr<SnarlCaller::CallInfo>& call_info,
const string& ref_path_name, int ref_offset, bool genotype_snarls) const;
const string& ref_path_name, int ref_offset, bool genotype_snarls, int ploidy,
function<string(const vector<SnarlTraversal>&, const vector<int>&, int, int, int)> trav_to_string = nullptr);

/// get the interval of a snarl from our reference path using the PathPositionHandleGraph interface
/// the bool is true if the snarl's backward on the path
Expand All @@ -98,6 +100,15 @@ class VCFOutputCaller {
/// clean up the alleles to not share common prefixes / suffixes
/// if len_override given, just do that many bases without thinking
void flatten_common_allele_ends(vcflib::Variant& variant, bool backward, size_t len_override) const;

/// print a snarl in a consistent form <SNARL(Start,END)>
string print_snarl(const Snarl& snarl, bool in_brackets = true) const;

/// do the opposite of above
/// So a string that looks like AACT<12_-17>TTT would invoke the callback three times with
/// ("AACT", Snarl), ("", Snarl(12,-17)), ("TTT", Snarl(12,-17))
/// The parameters are to be treated as unions: A sequence fragment if non-empty, otherwise a snarl
void scan_snarl(const string& allele_string, function<void(const string&, Snarl&)> callback) const;

/// output vcf
mutable vcflib::VariantCallFile output_vcf;
Expand Down Expand Up @@ -161,6 +172,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu
vcflib::VariantCallFile& variant_file,
const string& sample_name,
const vector<string>& ref_paths,
const vector<int>& ref_path_ploidies,
FastaReference* ref_fasta,
FastaReference* ins_fasta,
AlignmentEmitter* aln_emitter,
Expand All @@ -170,7 +182,7 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu

virtual ~VCFGenotyper();

virtual bool call_snarl(const Snarl& snarl, int ploidy);
virtual bool call_snarl(const Snarl& snarl);

virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
const vector<size_t>& contig_length_overrides = {}) const;
Expand Down Expand Up @@ -200,6 +212,9 @@ class VCFGenotyper : public GraphCaller, public VCFOutputCaller, public GAFOutpu

/// toggle whether to output vcf or gaf
bool gaf_output;

/// the ploidies
unordered_map<string, int> path_to_ploidy;
};


Expand All @@ -215,11 +230,12 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller {
SnarlManager& snarl_manager,
const string& sample_name,
const vector<string>& ref_paths = {},
const vector<size_t>& ref_path_offsets = {});
const vector<size_t>& ref_path_offsets = {},
const vector<int>& ref_path_ploidies = {});

virtual ~LegacyCaller();

virtual bool call_snarl(const Snarl& snarl, int ploidy);
virtual bool call_snarl(const Snarl& snarl);

virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
const vector<size_t>& contig_length_overrides = {}) const;
Expand Down Expand Up @@ -273,6 +289,9 @@ class LegacyCaller : public GraphCaller, public VCFOutputCaller {
/// keep track of offsets in the reference paths
map<string, size_t> ref_offsets;

/// keep track of ploidies in the reference paths
map<string, int> ref_ploidies;

/// Tuning

/// How many nodes should we be willing to look at on our path back to the
Expand Down Expand Up @@ -307,6 +326,7 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC
TraversalFinder& traversal_finder,
const vector<string>& ref_paths,
const vector<size_t>& ref_path_offsets,
const vector<int>& ref_path_ploidies,
AlignmentEmitter* aln_emitter,
bool traversals_only,
bool gaf_output,
Expand All @@ -315,7 +335,7 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC

virtual ~FlowCaller();

virtual bool call_snarl(const Snarl& snarl, int ploidy);
virtual bool call_snarl(const Snarl& snarl);

virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
const vector<size_t>& contig_length_overrides = {}) const;
Expand All @@ -334,9 +354,12 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC

/// keep track of offsets in the reference paths
map<string, size_t> ref_offsets;

/// keep traco of the ploidies (todo: just one map for all path stuff!!)
map<string, int> ref_ploidies;

/// until we support nested snarls, cap snarl size we attempt to process
size_t max_snarl_edges = 500000;
size_t max_snarl_edges = 10000;

/// alignment emitter. if not null, traversals will be output here and
/// no genotyping will be done
Expand All @@ -354,6 +377,160 @@ class FlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputC
bool genotype_snarls;
};

class SnarlGraph;

/**
* FlowCaller : Uses any traversals finder (ex, FlowTraversalFinder) to find
* traversals, and calls those based on how much support they have.
* Should work on any graph but will not
* report cyclic traversals.
*
* todo: this is a generalization of FlowCaller and should be able to replace it entirely after testing
* to get rid of duplicated code.
*/
class NestedFlowCaller : public GraphCaller, public VCFOutputCaller, public GAFOutputCaller {
public:
NestedFlowCaller(const PathPositionHandleGraph& graph,
SupportBasedSnarlCaller& snarl_caller,
SnarlManager& snarl_manager,
const string& sample_name,
TraversalFinder& traversal_finder,
const vector<string>& ref_paths,
const vector<size_t>& ref_path_offsets,
const vector<int>& ref_path_ploidies,
AlignmentEmitter* aln_emitter,
bool traversals_only,
bool gaf_output,
size_t trav_padding,
bool genotype_snarls);

virtual ~NestedFlowCaller();

virtual bool call_snarl(const Snarl& snarl);

virtual string vcf_header(const PathHandleGraph& graph, const vector<string>& contigs,
const vector<size_t>& contig_length_overrides = {}) const;

protected:

/// stuff we remember for each snarl call, to be used when genotyping its parent
struct CallRecord {
vector<SnarlTraversal> travs;
vector<pair<vector<int>, unique_ptr<SnarlCaller::CallInfo>>> genotype_by_ploidy;
string ref_path_name;
pair<int64_t, int64_t> ref_path_interval;
int ref_trav_idx; // index of ref paths in CallRecord::travs
};
typedef map<Snarl, CallRecord, NestedCachedPackedTraversalSupportFinder::snarl_less> CallTable;

/// update the table of calls for each child snarl (and the input snarl)
bool call_snarl_recursive(const Snarl& managed_snarl, int ploidy,
const string& parent_ref_path_name, pair<size_t, size_t> parent_ref_path_interval,
CallTable& call_table);

/// emit the vcf of all reference-spanning snarls
/// The call_table needs to be completely resolved
bool emit_snarl_recursive(const Snarl& managed_snarl, int ploidy,
CallTable& call_table);

/// transform the nested allele string from something like AAC<6_10>TTT to
/// a proper string by recursively resolving the nested snarls into alleles
string flatten_reference_allele(const string& nested_allele, const CallTable& call_table) const;
string flatten_alt_allele(const string& nested_allele, int allele, int ploidy, const CallTable& call_table) const;

/// the graph
const PathPositionHandleGraph& graph;

/// the traversal finder
TraversalFinder& traversal_finder;

/// keep track of the reference paths
vector<string> ref_paths;
unordered_set<string> ref_path_set;

/// keep track of offsets in the reference paths
map<string, size_t> ref_offsets;

/// keep traco of the ploidies (todo: just one map for all path stuff!!)
map<string, int> ref_ploidies;

/// until we support nested snarls, cap snarl size we attempt to process
size_t max_snarl_shallow_size = 50000;

/// alignment emitter. if not null, traversals will be output here and
/// no genotyping will be done
AlignmentEmitter* alignment_emitter;

/// toggle whether to genotype or just output the traversals
bool traversals_only;

/// toggle whether to output vcf or gaf
bool gaf_output;

/// toggle whether to genotype every snarl
/// (by default, uncalled snarls are skipped, and coordinates are flattened
/// out to minimize variant size -- this turns all that off)
bool genotype_snarls;

/// a hook into the snarl_caller's nested support finder
NestedCachedPackedTraversalSupportFinder& nested_support_finder;
};


/** Simplification of a NetGraph that ignores chains. It is designed only for
traversal finding. Todo: generalize NestedFlowCaller to the point where we
can remove this and use NetGraph instead */
class SnarlGraph : virtual public HandleGraph {
public:
// note: can only deal with one snarl "level" at a time
SnarlGraph(const HandleGraph* backing_graph, SnarlManager& snarl_manager, vector<const Snarl*> snarls);

// go from node to snarl (first val false if not a snarl)
pair<bool, handle_t> node_to_snarl(handle_t handle) const;

// go from edge to snarl (first val false if not a virtual edge)
tuple<bool, handle_t, edge_t> edge_to_snarl_edge(edge_t edge) const;

// replace a snarl node with an actual snarl in the traversal
void embed_snarl(Visit& visit);
void embed_snarls(SnarlTraversal& traversal);

// replace a refpath through the snarl with the actual snarl in the traversal
// todo: this is a bed of a hack
void embed_ref_path_snarls(SnarlTraversal& traversal);

////////////////////////////////////////////////////////////////////////////
// Handle-based interface (which is all identical to backing graph)
////////////////////////////////////////////////////////////////////////////
bool has_node(nid_t node_id) const;
handle_t get_handle(const nid_t& node_id, bool is_reverse = false) const;
nid_t get_id(const handle_t& handle) const;
bool get_is_reverse(const handle_t& handle) const;
handle_t flip(const handle_t& handle) const;
size_t get_length(const handle_t& handle) const;
std::string get_sequence(const handle_t& handle) const;
size_t get_node_count() const;
nid_t min_node_id() const;
nid_t max_node_id() const;

protected:

bool for_each_handle_impl(const std::function<bool(const handle_t&)>& iteratee, bool parallel = false) const;

/// this is the only function that's changed to do anything different from the backing graph:
/// it is changed to "pass through" snarls by pretending there are edges from into snarl starts out of ends and
/// vice versa.
bool follow_edges_impl(const handle_t& handle, bool go_left, const std::function<bool(const handle_t&)>& iteratee) const;

/// the backing graph
const HandleGraph* backing_graph;

/// the snarl manager
SnarlManager& snarl_manager;

/// the snarls (indexed both ways). flag is true for original orientation
unordered_map<handle_t, pair<handle_t, bool>> snarls;
};


}
Expand Down
2 changes: 1 addition & 1 deletion src/snarl_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ void SupportBasedSnarlCaller::update_vcf_info(const Snarl& snarl,

}

const TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const {
TraversalSupportFinder& SupportBasedSnarlCaller::get_support_finder() const {
return support_finder;
}

Expand Down
2 changes: 1 addition & 1 deletion src/snarl_caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ class SupportBasedSnarlCaller : public SnarlCaller {
void set_min_supports(double min_mad_for_call, double min_support_for_call, double min_site_support);

/// Get the traversal support finder
const TraversalSupportFinder& get_support_finder() const;
TraversalSupportFinder& get_support_finder() const;

/// Get the minimum total support for call
virtual int get_min_total_support_for_call() const;
Expand Down
Loading

1 comment on commit 292717e

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 11429 seconds

Please sign in to comment.