From f631079df7beb9c65623b890d008589ab81f6674 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Tue, 22 Aug 2017 10:06:00 +0100 Subject: [PATCH 1/2] allow paralogs --- dist.ini | 2 +- lib/Bio/Roary.pm | 2 + lib/Bio/Roary/CommandLine/Roary.pm | 8 +- .../Roary/CommandLine/RoaryCoreAlignment.pm | 9 +- .../Roary/CommandLine/RoaryPostAnalysis.pm | 7 +- .../External/GeneAlignmentFromNucleotides.pm | 2 + lib/Bio/Roary/External/PostAnalysis.pm | 5 + .../Roary/ExtractCoreGenesFromSpreadsheet.pm | 227 +++++++++--------- lib/Bio/Roary/MergeMultifastaAlignments.pm | 2 +- t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t | 67 ++++-- 10 files changed, 193 insertions(+), 138 deletions(-) diff --git a/dist.ini b/dist.ini index 39fc6d2..4d8b414 100644 --- a/dist.ini +++ b/dist.ini @@ -1,5 +1,5 @@ name = Bio-Roary -version = 3.9.0 +version = 3.9.1 author = Andrew J. Page license = GPL_3 copyright_holder = Wellcome Trust Sanger Institute diff --git a/lib/Bio/Roary.pm b/lib/Bio/Roary.pm index feccc74..9ccd79a 100644 --- a/lib/Bio/Roary.pm +++ b/lib/Bio/Roary.pm @@ -48,6 +48,7 @@ has 'core_definition' => ( is => 'rw', isa => 'Num', default = has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'inflation_value' => ( is => 'rw', isa => 'Num', default => 1.5 ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 ); @@ -136,6 +137,7 @@ sub run { core_definition => $self->core_definition, verbose => $self->verbose, mafft => $self->mafft, + allow_paralogs => $self->allow_paralogs, ); $post_analysis->run(); diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm index 8c8b0c8..110adbe 100644 --- a/lib/Bio/Roary/CommandLine/Roary.pm +++ b/lib/Bio/Roary/CommandLine/Roary.pm @@ -47,6 +47,7 @@ has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 ); has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 ); has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); @@ -71,7 +72,7 @@ sub BUILD { $job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec, $apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table, $run_qc, $core_definition, $help, $kraken_db, $cmd_version, - $mafft, $output_directory, $check_dependancies, $inflation_value, + $mafft, $output_directory, $check_dependancies, $inflation_value, $allow_paralogs, ); GetOptionsFromArray( @@ -98,6 +99,7 @@ sub BUILD { 'cd|core_definition=f' => \$core_definition, 'v|verbose' => \$verbose, 'n|mafft' => \$mafft, + 'ap|allow_paralogs' => \$allow_paralogs, 'k|kraken_db=s' => \$kraken_db, 'w|version' => \$cmd_version, 'a|check_dependancies' => \$check_dependancies, @@ -302,7 +304,8 @@ sub run { core_definition => $self->core_definition, verbose => $self->verbose, mafft => $self->mafft, - inflation_value => $self->inflation_value, + allow_paralogs => $self->allow_paralogs, + inflation_value => $self->inflation_value, ); $pan_genome_obj->run(); @@ -343,6 +346,7 @@ Options: -p INT number of threads [1] -r create R plots, requires R and ggplot2 -s dont split paralogs -t INT translation table [11] + -ap allow paralogs in core alignment -z dont delete intermediate files -v verbose output to STDOUT -w print version and exit diff --git a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm index dc52a60..4359a35 100644 --- a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm +++ b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm @@ -27,13 +27,14 @@ has 'spreadsheet_filename' => ( is => 'rw', isa => 'Str', default => 'gene_ has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'core_gene_alignment.aln' ); has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has '_error_message' => ( is => 'rw', isa => 'Str' ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); sub BUILD { my ($self) = @_; - my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $dont_delete_files ); + my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $allow_paralogs, $dont_delete_files ); GetOptionsFromArray( $self->args, @@ -42,6 +43,7 @@ sub BUILD { 'o|output_filename=s' => \$output_filename, 'cd|core_definition=f' => \$core_definition, 'z|dont_delete_files' => \$dont_delete_files, + 'p|allow_paralogs' => \$allow_paralogs, 'v|verbose' => \$verbose, 'h|help' => \$help, ); @@ -51,6 +53,7 @@ sub BUILD { $self->logger->level(10000); } $self->help($help) if(defined($help)); + $self->allow_paralogs($allow_paralogs) if(defined($allow_paralogs)); if ( defined($multifasta_base_directory) && ( -d $multifasta_base_directory ) ) { $self->multifasta_base_directory( abs_path($multifasta_base_directory)); @@ -95,7 +98,8 @@ sub run { $self->logger->info("Extract core genes from spreadsheet"); my $core_genes_obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => $self->spreadsheet_filename, - core_definition => $self->core_definition + core_definition => $self->core_definition, + allow_paralogs => $self->allow_paralogs ); $self->logger->info("Looking up genes in files"); @@ -130,6 +134,7 @@ Options: -o STR output filename [core_gene_alignment.aln] -cd FLOAT percentage of isolates a gene must be in to be core [99] -m STR directory containing gene multi-FASTAs [pan_genome_sequences] -s STR gene presence and absence spreadsheet [gene_presence_absence.csv] + -p allow paralogs -z dont delete intermediate files -v verbose output to STDOUT -h this help message diff --git a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm index f6a59d0..fdbfbd6 100644 --- a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm +++ b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm @@ -41,6 +41,7 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 500 has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); sub BUILD { my ($self) = @_; @@ -48,7 +49,7 @@ sub BUILD { my ( $output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename, $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition, - $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft + $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs ); @@ -72,6 +73,7 @@ sub BUILD { 'cd|core_definition=f' => \$core_definition, 'v|verbose' => \$verbose, 'n|mafft' => \$mafft, + 'q|allow_paralogs' => \$allow_paralogs, 'h|help' => \$help, ); @@ -93,6 +95,7 @@ sub BUILD { $self->group_limit($group_limit) if ( defined($group_limit) ); $self->core_definition( $core_definition/100 ) if ( defined($core_definition) ); $self->mafft($mafft) if ( defined($mafft) ); + $self->allow_paralogs($allow_paralogs) if ( defined($allow_paralogs) ); if ( defined($verbose) ) { $self->verbose($verbose); $self->logger->level(10000); @@ -158,6 +161,7 @@ sub run { cpus => $self->cpus, verbose => $self->verbose, mafft => $self->mafft, + allow_paralogs => $self->allow_paralogs, dont_delete_files => $self->dont_delete_files, num_input_files => $#{$input_files}, ); @@ -222,6 +226,7 @@ Options: -a dont delete intermediate files -n fast core gene alignement with MAFFT instead of PRANK -o STR clusters output filename [clustered_proteins] -p STR output pan genome filename [pan_genome.fa] + -q allow paralogs in core alignment -s STR output gene presence and absence filename [gene_presence_absence.csv] -t INT translation table [11] -z INT number of threads [1] diff --git a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm index 122628a..3f65daa 100644 --- a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm +++ b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm @@ -29,6 +29,7 @@ has 'translation_table' => ( is => 'rw', isa => 'Int', default => has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'num_input_files' => ( is => 'ro', isa => 'Int', required => 1); # Overload Role` @@ -85,6 +86,7 @@ sub _build__core_alignment_cmd { my $core_cmd = "pan_genome_core_alignment"; $core_cmd .= " -cd " . ($self->core_definition*100) if ( defined $self->core_definition ); $core_cmd .= " --dont_delete_files " if ( defined $self->dont_delete_files && $self->dont_delete_files == 1 ); + $core_cmd .= " --allow_paralogs " if ( defined $self->allow_paralogs && $self->allow_paralogs == 1 ); return $core_cmd; } diff --git a/lib/Bio/Roary/External/PostAnalysis.pm b/lib/Bio/Roary/External/PostAnalysis.pm index 211b746..1de663a 100644 --- a/lib/Bio/Roary/External/PostAnalysis.pm +++ b/lib/Bio/Roary/External/PostAnalysis.pm @@ -37,6 +37,7 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50 has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 ); has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 ); +has 'allow_paralogs' => ( is => 'ro', isa => 'Bool', default => 0 ); has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } ); has '_gff_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__gff_fofn' ); has '_fasta_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_fofn' ); @@ -137,6 +138,9 @@ sub _command_to_run { my $verbose_flag = ''; $verbose_flag = '-v' if ( defined($self->verbose) && $self->verbose == 1 ); + + my $allow_paralogs_flag = ''; + $allow_paralogs_flag = '--allow_paralogs' if ( defined($self->allow_paralogs) && $self->allow_paralogs == 1 ); return join( " ", @@ -156,6 +160,7 @@ sub _command_to_run { $verbose_stats_flag, $verbose_flag, $mafft_flag, + $allow_paralogs_flag, '-j', $self->job_runner, '--processors', $self->cpus, '--group_limit', $self->group_limit, diff --git a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm index 9fa6fed..7c898fc 100644 --- a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm +++ b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm @@ -19,33 +19,33 @@ use Text::CSV; use Bio::Roary::GroupStatistics; use POSIX; -has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); -has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' ); -has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); -has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); -has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); -has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} ); -has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub {{}} ); - -has '_number_of_isolates' => ( is => 'rw', isa => 'Int'); -has '_gene_column' => ( is => 'rw', isa => 'Int'); -has '_num_isolates_column' => ( is => 'rw', isa => 'Int'); -has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int'); -has '_genome_fragement_column' => ( is => 'rw', isa => 'Int'); -has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int'); -has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); +has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); +has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' ); +has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); +has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); +has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); +has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); +has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); +has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); + +has '_number_of_isolates' => ( is => 'rw', isa => 'Int' ); +has '_gene_column' => ( is => 'rw', isa => 'Int' ); +has '_num_isolates_column' => ( is => 'rw', isa => 'Int' ); +has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' ); +has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' ); +has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' ); +has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); sub _build__min_no_isolates_for_core { - my ($self) = @_; - my $threshold = $self->_number_of_isolates * $self->core_definition; + my ($self) = @_; + my $threshold = $self->_number_of_isolates * $self->core_definition; - return $threshold; + return $threshold; } -sub _build__csv_parser -{ - my ($self) = @_; - return Text::CSV->new( { binary => 1, always_quote => 1} ); +sub _build__csv_parser { + my ($self) = @_; + return Text::CSV->new( { binary => 1, always_quote => 1 } ); } sub _build__input_spreadsheet_fh { @@ -54,115 +54,112 @@ sub _build__input_spreadsheet_fh { return $fh; } -sub _update_number_of_isolates -{ - my ($self, $header_row) = @_; - my $number_of_isolates = @{$header_row} - @{Bio::Roary::GroupStatistics->fixed_headers}; - $self->_number_of_isolates($number_of_isolates); +sub _update_number_of_isolates { + my ( $self, $header_row ) = @_; + my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers }; + $self->_number_of_isolates($number_of_isolates); } -sub _setup_column_mappings -{ - my ($self, $header_row) = @_; - # current ordering - my %columns_of_interest_mappings = ( - 'Gene' => 0, - 'No. isolates' => 3, - 'Avg sequences per isolate' => 5, - 'Genome Fragment' => 6, - 'Order within Fragment' => 7, - 'QC' => 10, +sub _setup_column_mappings { + my ( $self, $header_row ) = @_; + + # current ordering + my %columns_of_interest_mappings = ( + 'Gene' => 0, + 'No. isolates' => 3, + 'Avg sequences per isolate' => 5, + 'Genome Fragment' => 6, + 'Order within Fragment' => 7, + 'QC' => 10, ); - - # Dynamically overwrite the default ordering - for(my $i = 0; $i < @{$header_row}; $i++) - { - for my $col_name (%columns_of_interest_mappings) - { - if($header_row->[$i] eq $col_name) - { - $columns_of_interest_mappings{$col_name} = $i; - last; - } + + # Dynamically overwrite the default ordering + for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) { + for my $col_name (%columns_of_interest_mappings) { + if ( $header_row->[$i] eq $col_name ) { + $columns_of_interest_mappings{$col_name} = $i; + last; + } + } } - } - $self->_gene_column($columns_of_interest_mappings{'Gene'}); - $self->_num_isolates_column($columns_of_interest_mappings{'No. isolates'}); - $self->_avg_sequences_per_isolate_column($columns_of_interest_mappings{'Avg sequences per isolate'}); - $self->_genome_fragement_column($columns_of_interest_mappings{'Genome Fragment'}); - $self->_order_within_fragement_column($columns_of_interest_mappings{'Order within Fragment'}); - $self->_update_number_of_isolates($header_row); - - # Get the sample_names - my @sample_names; - for(my $i = $self->_length_of_fixed_headers(); $i < @{$header_row}; $i++) - { - push(@sample_names,$header_row->[$i]); - } - $self->sample_names(\@sample_names); + $self->_gene_column( $columns_of_interest_mappings{'Gene'} ); + $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} ); + $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} ); + $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} ); + $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} ); + $self->_update_number_of_isolates($header_row); + + # Get the sample_names + my @sample_names; + for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) { + push( @sample_names, $header_row->[$i] ); + } + $self->sample_names( \@sample_names ); } -sub _length_of_fixed_headers -{ - my ($self) = @_; - return @{Bio::Roary::GroupStatistics->fixed_headers()}; +sub _length_of_fixed_headers { + my ($self) = @_; + return @{ Bio::Roary::GroupStatistics->fixed_headers() }; } -sub _populate_sample_to_gene_lookup_with_row -{ - my ($self, $row) = @_; - - for(my $i = $self->_length_of_fixed_headers(); $i < @{$row}; $i++ ) - { - if(defined($row->[$i]) && $row->[$i] ne "" ) - { - my $sample_name = $self->sample_names->[$i - $self->_length_of_fixed_headers()]; - - $self->sample_names_to_genes->{$sample_name}->{$row->[$i]} = 1; - } - } - return 1; -} +sub _populate_sample_to_gene_lookup_with_row { + my ( $self, $row ) = @_; + for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) { + if ( defined( $row->[$i] ) && $row->[$i] ne "" ) { + my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ]; -sub _ordered_core_genes -{ - my ($self) = @_; - my %ordered_genes; - while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) - { - next if(@{$row} < 12); # no genes in group - next if(!defined($row->[$self->_gene_column]) || $row->[$self->_gene_column] eq '' ); # no gene name - next if(!defined($row->[$self->_avg_sequences_per_isolate_column]) || $row->[$self->_avg_sequences_per_isolate_column] eq '' ); # no average - next if(!defined($row->[$self->_genome_fragement_column]) || $row->[$self->_genome_fragement_column] eq '' ); # fragment not defined - - # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates - next if ( $row->[$self->_num_isolates_column] < $self->_min_no_isolates_for_core ); - next if($row->[$self->_avg_sequences_per_isolate_column] != 1); - $ordered_genes{$row->[$self->_genome_fragement_column]}{$row->[$self->_order_within_fragement_column]} = $row->[$self->_gene_column]; - $self->_populate_sample_to_gene_lookup_with_row($row); - } - - my @ordered_core_genes ; - for my $fragment_key(sort {$a <=> $b } keys %ordered_genes) - { - for my $order_within_fragement(sort {$a <=> $b } keys %{$ordered_genes{$fragment_key}}) - { - push(@ordered_core_genes,$ordered_genes{$fragment_key}{$order_within_fragement}); + $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1; + } } - } - return \@ordered_core_genes; + return 1; } -sub _build_ordered_core_genes -{ - my ($self) = @_; - my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); - $self->_setup_column_mappings($header_row); +sub _ordered_core_genes { + my ($self) = @_; + my %ordered_genes; + while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { + next if ( @{$row} < 12 ); # no genes in group + next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name + next + if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' ) + ; # no average + next + if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' ) + ; # fragment not defined + + # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates + next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core ); + + if ( $self->allow_paralogs ) { + # should never happen + next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 ); + } + else { + next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 ); + } + + $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } = + $row->[ $self->_gene_column ]; + $self->_populate_sample_to_gene_lookup_with_row($row); + } - return $self->_ordered_core_genes(); + my @ordered_core_genes; + for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) { + for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) { + push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} ); + } + } + return \@ordered_core_genes; } +sub _build_ordered_core_genes { + my ($self) = @_; + my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); + $self->_setup_column_mappings($header_row); + + return $self->_ordered_core_genes(); +} no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Bio/Roary/MergeMultifastaAlignments.pm b/lib/Bio/Roary/MergeMultifastaAlignments.pm index d2dd55e..f74d4ac 100644 --- a/lib/Bio/Roary/MergeMultifastaAlignments.pm +++ b/lib/Bio/Roary/MergeMultifastaAlignments.pm @@ -72,7 +72,7 @@ sub _sequence_for_sample_from_gene_file { my ( $self, $sample_name, $gene_file ) = @_; # loop over this to get the geneIDs - for my $gene_id ( keys %{ $self->_gene_to_sequence->{$gene_file} } ) { + for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) { if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) { return $self->_gene_to_sequence->{$gene_file}->{$gene_id}; } diff --git a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t index 92c747d..edfc2b3 100755 --- a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t +++ b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t @@ -13,21 +13,56 @@ BEGIN { my $obj; -ok($obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( - spreadsheet => 't/data/core_group_statistics.csv', -),'initalise obj'); -is_deeply($obj->ordered_core_genes, ['argF','speH','group_5'], 'Correct ordering'); -is_deeply($obj->sample_names_to_genes, { - 'query_2' => { - '2_3' => 1, - '2_7' => 1, - '2_2' => 1 - }, - 'query_1' => { - '1_6' => 1, - '1_3' => 1, - '1_2' => 1 - } - }, 'Correct of sample names to genes is correct'); +ok( + $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( + spreadsheet => 't/data/core_group_statistics.csv', + ), + 'initalise obj' +); +is_deeply( $obj->ordered_core_genes, [ 'argF', 'speH', 'group_5' ], 'Correct ordering' ); +is_deeply( + $obj->sample_names_to_genes, + { + 'query_2' => { + '2_3' => 1, + '2_7' => 1, + '2_2' => 1 + }, + 'query_1' => { + '1_6' => 1, + '1_3' => 1, + '1_2' => 1 + } + }, + 'Correct of sample names to genes is correct' +); + +ok( + $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( + spreadsheet => 't/data/core_group_statistics.csv', + allow_paralogs => 1, + ), + 'initalise obj where paralogs allowed' +); +is_deeply( $obj->ordered_core_genes, [ 'argF', 'hly', 'speH', 'group_5' ], 'Correct ordering where paralogs allowed' ); + +is_deeply( + $obj->sample_names_to_genes, + { + 'query_2' => { + '2_3' => 1, + '2_7' => 1, + '2_1' => 1, + '2_2' => 1 + }, + 'query_1' => { + '1_6' => 1, + '1_3' => 1, + '1_1' => 1, + '1_2' => 1 + } + }, + 'Correct of sample names to genes is correct where paralogs allowed' +); done_testing(); From 8d34a81bf3826458f64aad5d3049933eedf5f0a0 Mon Sep 17 00:00:00 2001 From: andrewjpage Date: Tue, 22 Aug 2017 10:52:53 +0100 Subject: [PATCH 2/2] fix usage --- lib/Bio/Roary/CommandLine/Roary.pm | 4 ++-- lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm index 110adbe..3052b6c 100644 --- a/lib/Bio/Roary/CommandLine/Roary.pm +++ b/lib/Bio/Roary/CommandLine/Roary.pm @@ -346,12 +346,12 @@ Options: -p INT number of threads [1] -r create R plots, requires R and ggplot2 -s dont split paralogs -t INT translation table [11] - -ap allow paralogs in core alignment + -ap allow paralogs in core alignment -z dont delete intermediate files -v verbose output to STDOUT -w print version and exit -y add gene inference information to spreadsheet, doesnt work with -e - -iv STR Change the MCL inflation value [1.5] + -iv STR Change the MCL inflation value [1.5] -h this help message Example: Quickly generate a core gene alignment using 8 threads diff --git a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm index 4359a35..f586a27 100644 --- a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm +++ b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm @@ -134,7 +134,7 @@ Options: -o STR output filename [core_gene_alignment.aln] -cd FLOAT percentage of isolates a gene must be in to be core [99] -m STR directory containing gene multi-FASTAs [pan_genome_sequences] -s STR gene presence and absence spreadsheet [gene_presence_absence.csv] - -p allow paralogs + -p allow paralogs -z dont delete intermediate files -v verbose output to STDOUT -h this help message