Skip to content

Commit

Permalink
Merge pull request #4190 from vgteam/sample-contig-paths
Browse files Browse the repository at this point in the history
Recognize sample#contig paths in GFA to GBWTGraph
  • Loading branch information
jltsiren authored Dec 10, 2023
2 parents 710f00c + 60a5e42 commit faf6499
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 5 deletions.
19 changes: 16 additions & 3 deletions src/gbwtgraph_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,38 +12,51 @@ gbwtgraph::GFAParsingParameters get_best_gbwtgraph_gfa_parsing_parameters() {
// Configure GBWTGraph GFA parsing to be as close to the vg GFA parser as we can get.
// TODO: Make it closer.
parameters.path_name_formats.clear();
// Parse panSN with a fragment after it.

// Parse panSN with a fragment after it (e.g. HG002#1#chr3#1).
parameters.path_name_formats.emplace_back(
gbwtgraph::GFAParsingParameters::PAN_SN_REGEX + "#([0-9][0-9]*)",
gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS + "F",
gbwtgraph::GFAParsingParameters::PAN_SN_SENSE
);

// Parse panSN with a range after it as normal panSN, but with a fragment based
// on start position.
// on start position (e.g. HG002#1#chr3[1566235] or HG002#1#chr3[1566235-2397571]).
parameters.path_name_formats.emplace_back(
gbwtgraph::GFAParsingParameters::PAN_SN_REGEX + "\\[([0-9][0-9]*)(-[0-9]*)?\\]",
gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS + "F",
gbwtgraph::GFAParsingParameters::PAN_SN_SENSE
);
// Parse standard panSN as what we think that is

// Parse standard panSN as what we think that is (e.g. HG002#1#chr3).
parameters.path_name_formats.emplace_back(
gbwtgraph::GFAParsingParameters::PAN_SN_REGEX,
gbwtgraph::GFAParsingParameters::PAN_SN_FIELDS,
gbwtgraph::GFAParsingParameters::PAN_SN_SENSE
);

// Parse path names with a sample and a contig (e.g. GRCh38#chr3).
parameters.path_name_formats.emplace_back(
"(.*)#(.*)",
"XSC",
PathSense::HAPLOTYPE
);

// Parse paths with just a name and a range as generic paths with a contig
// and a fragment. Sample for generic paths gets provided automatically.
parameters.path_name_formats.emplace_back(
"(.*)\\[([0-9][0-9]*)(-[0-9]*)?\\]",
"XCF",
PathSense::GENERIC
);

// Parse paths with nothing else to distinguish them in the default way (as generic named paths).
parameters.path_name_formats.emplace_back(
gbwtgraph::GFAParsingParameters::DEFAULT_REGEX,
gbwtgraph::GFAParsingParameters::DEFAULT_FIELDS,
gbwtgraph::GFAParsingParameters::DEFAULT_SENSE
);

return parameters;
}

Expand Down
3 changes: 1 addition & 2 deletions test/t/37_vg_gbwt.t
Original file line number Diff line number Diff line change
Expand Up @@ -404,8 +404,7 @@ is "$(vg gbwt --tags -Z gfa3.gbz | grep reference_samples | cut -f 2)" "GRCh37 C
rm -f gfa.gbz gfa2.gbz gfa3.gbz tags.tsv

# Build a GBZ from a graph with a reference but no haplotype phase number
# TODO: When <https://github.com/vgteam/vg/issues/4110> is fixed, actually parse this as GFA
vg gbwt -g gfa.gbz --gbz-format -E -x graphs/gfa_two_part_reference.gfa
vg gbwt -g gfa.gbz --gbz-format -G graphs/gfa_two_part_reference.gfa
is "$(vg paths -M --reference-paths -x gfa.gbz | grep -v "^#" | cut -f4 | grep NO_HAPLOTYPE | wc -l)" "2" "GBZ can represent reference paths without haplotype numbers"

rm -f gfa.gbz
Expand Down

1 comment on commit faf6499

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 25326 seconds

Please sign in to comment.