#!/usr/bin/perl
#
# Module for data-mining. This is a restricted version based on our published results. As soon as more of my
# results using this module are published, I will include more subroutines.
#
# Note: All the files from a given type and organisms must be in the name of the organism as will be specified in the query

# algorithm for data-mining
sub mine_data {
	my $dir_file = shift @_;
	my @g_names = split / /, shift @_;
	my($out_file, @groups) = @_;
	my @in_files = dir_or_file($dir_file);
	my $nuc_fix = 0;
	$nuc_fix = 30 unless $dir_file =~ /.*\/CDS\/$/;	# more stringent for nucleotides

 	foreach my $in_file (@in_files) {
		my $count = 0;
		foreach my $group (@groups) {
			++$count;
			my $out_path = ">".$g_names[$count-1]."_".$out_file;
			$out_path = ">".$out_path if -e $g_names[$count-1]."_".$out_file;
			open OUT, $out_path;
       			print OUT "Group correction algorithm\n\n";

			# first we get the alignment of all the sequences in the reference alignment
			my $in = Bio::AlignIO->new(-file => "<$in_file",
						'-format' => 'clustalw');
			my $ref_aln = $in->next_aln();

			my $num_seq = $ref_aln->no_sequences();	# number of sequences in the overall alignment

			# and we get its statistics
			undef my @ref_matches;
			undef my @ref_strong;
			undef my @ref_weak;
			undef my @ref_mismatches;
			my $ref_sim = align_stats($ref_aln, \@ref_matches, \@ref_strong, \@ref_weak, \@ref_mismatches);
			my $ref_consensus = $ref_aln->consensus_string(30+$nuc_fix);

			# then we keep only the sequences under study (@$group)
			re_align(1, \$ref_aln, @$group);

			# and get their stats
			undef my @query_matches;
			undef my @query_strong;
			undef my @query_weak;
			undef my @query_mismatches;
			my $query_sim = align_stats($ref_aln, \@query_matches, \@query_strong, \@query_weak, \@query_mismatches);
			my $query_consensus = $ref_aln->consensus_string(30+$nuc_fix);
			my $control_consensus = $ref_aln->consensus_string(60+$nuc_fix);

			# if the query sequences are very divergent, we use a less stringent threshold
			if ($query_sim < 60) {
				@query_matches = (@query_matches, @query_strong, @query_weak);
			} elsif ($query_sim < 70) {
				@query_matches = (@query_matches, @query_strong);
			} elsif ($query_sim > 90) {
				@query_mismatches = (@query_mismatches, @query_weak);
			}

			# the data-mining is divided into finding disrupted funcitonal motifs and finding novel motifs

			# to find novel functional motifs, we check whether the conserved sequences in
			# the query sequence are different in the other sequences

			# we check the sequences without those in the query
			my $new_in = Bio::AlignIO->new(-file => "<$in_file",
						'-format' => 'clustalw');
			my $comp_aln = $new_in->next_aln();
			re_align(0, \$comp_aln, @$group);	# alignment with all sequences except those in the query

			# probably a bug in Bioperl obliges me to use this system
			save_align("temp.txt", $comp_aln);
			$new_in = Bio::AlignIO->new(-file => "<temp.txt",
					'-format' => 'clustalw');
			$comp_aln = $new_in->next_aln();

			undef my @comp_matches;
			undef my @comp_strong;
			undef my @comp_weak;
			undef my @comp_mismatches;
			my $comp_sim = align_stats($comp_aln, \@comp_matches, \@comp_strong, \@comp_weak, \@comp_mismatches);
			my $comp_seq = $comp_aln->consensus_string(40+$nuc_fix);

			# we exclude the positions that are similar between both query and reference alignments
			undef my @false_positives;
			for (0..length($query_consensus)) {
				push @false_positives, $_ if (substr($query_consensus, $_, 1) eq substr($comp_seq, $_, 1));
				push @false_mismatches, $_ if (substr($query_consensus, $_, 1) ne "?");
				push @false_positives, $_ if (substr($control_consensus, $_, 1) eq "?");
				push @false_positives, $_ if (substr($ref_consensus, $_, 1) eq "?" && $query_sim > 80);
			}
			my @results_novel = unique_positions(\@query_matches, \@false_positives, 0, 0);

			# we also exclude the positions that are repeated in the sequences not in the query
			undef @false_positives;
			for (1..($comp_aln->no_sequences())) {
				my $seq = $comp_aln->get_seq_by_pos($_);
				my $seq_string = $seq->seq();
				for (0..length($ref_consensus)) {
					push @false_positives, $_ if substr($query_consensus, $_, 1) eq substr($seq_string, $_, 1);
				}
			}
			@results_novel = unique_positions(\@results_novel, \@false_positives, $num_seq, 10);

			# and we can exclude the positions that are not conserved in the reference alignment
			# this is advizable if using very similar query sequences
			if ($query_sim > 80) {
				@results_novel = unique_positions(\@results_novel, \@comp_mismatches, 0, 0);
			} elsif ($query_sim > 90) {
				@results_novel = unique_positions(\@results_novel, \@comp_weak, 0, 0);
			}

	      		# findinding disrupted functional motifs takes the mismatches of our query and
			# checks whether these are conserved in the other sequences

			my @results_disrupted = unique_positions(\@query_mismatches, \@comp_mismatches, 0, 0);
			@results_disrupted = unique_positions(\@results_disrupted, \@false_mismatches, 0, 0);

			# to save the resulting alignment to file (useful as a control)
			#save_align("m".$out_file, $ref_aln);

			# print out results
			$in_file =~ /^.*\/(\S+)al.fa$/;
			my $name = $1;

			print OUT "  ### ".$g_names[$count-1]." group ###\n\n";
			print OUT "$name (length = ".length($query_consensus)."): $ref_sim% identity in reference alignment and $query_sim% in query\n\n";
			evolutionary_selection(\@query_matches, \@query_strong, \@query_weak, \@query_mismatches, \@ref_matches, \@ref_strong, \@ref_weak, \@ref_mismatches) unless $nuc_fix == 30 || scalar(@$group) <= 1;
			output_unique($name, "+", $query_consensus, \@ref_matches, \@results_novel);
			output_unique($name, "-", $query_consensus, \@ref_matches, \@results_disrupted);
			print OUT "\n";
			close OUT;
		}
     	}
	unlink "temp.txt";
}

# simple subroutine to save alignments, used mostly as a control
sub save_align {
	my($out_file, $aln) = @_;
	my $out  = Bio::AlignIO->new(-file => ">$out_file",
			'-format' => 'clustalw');
	$out->write_aln($aln);
}

# subroutine to print out the results into OUT, which must be previously opened
sub output_unique {
	my($name, $type, $consensus, $temp_m, $temp_r) = @_;
	my @results = @$temp_r;
	my @matches = @$temp_m;
	if (@results && scalar(@results) != 0) {
		if ($type eq "-") {
			print OUT scalar(@results)." unique mismatches found in $name (".int(length($consensus)+1)." aa): ";
			$type = "|";
		} else {
			print OUT scalar(@results)." unique aa found in $name (".int(length($consensus)+1)." aa): ";
			$type = "^";
		}
		foreach (1..scalar(@results)) {
			$results[$_-1] += 1;
		}
		foreach (1..scalar(@matches)) {
			$matches[$_-1] += 1;
		}
		@results = sort s_by_number @results;
		print OUT "@results"."\n";
		print OUT "Resulting sequence:\n";
		if (length($consensus) > 50) {
			for my $count (0..int(length($consensus)/60)) {
			  	print OUT substr($consensus, $count*60, 60)."\n";
				my $cursor = 0;
				# we print the results
				foreach (@results) {
					unless ($_ <= $count*60 || $_ > $count*60+60) {
						printf OUT "%".($_-$count*60-$cursor)."s", $type;						$cursor = $_-$count*60;
					}
				}
				print OUT "\n" unless $cursor == 0;
				# we print the conserved regions in the reference alignment
				#my $cons_cursor = 0;
				$cursor = 0;
				foreach (@matches) {
					unless ($_ <= $count*60 || $_ > $count*60+60) {
						printf OUT "%".($_-$count*60-$cursor)."s", "-";						$cursor = $_-$count*60;
					}
				}
				print OUT "\n" unless $cursor == 0;
			}
		} else {
			print OUT $consensus."\n";
		}

		print OUT "\n";
	} else {
		if ($type eq "-") {
			print OUT "No unique mismatches found in $name\n";
		} else {
			print OUT "No unique matches found in $name\n";
		}
	}
}

sub s_by_number { $a <=> $b }

# subroutine to check matches between two aligments
sub compare_matches {
	my($ref, $query) = @_;
	undef my @found;
	foreach my $ref_pos (@$ref) {
		foreach my $pos (@$query) {
			push @found, $pos if ($pos == $ref_pos);
		}
	}
	return @found;
}

#subroutine that removes ($remove_or_keep == 0) or keeps ($remove_or_keep == 1) all sequences except those names in @names from an aligment
sub re_align {
	my($remove_or_keep, $aln, @names) = @_;
	my $num_seq = $$aln->no_sequences();
	for (0..($$aln->no_sequences()-1)) {
		my $seq = $$aln->get_seq_by_pos($num_seq-$_);
		my $flag = 0;
		foreach (@names) {
			$flag = 1 if $seq->id() eq $_;
		}
		$$aln->remove_seq($seq) unless $flag == $remove_or_keep;
	}
}

# subroutine to find unique mismatches
# it returns the numbers in $query not found in $ref
sub unique_positions {
	my($query, $ref, $num_seq, $threshold) = @_;
	undef my @results;
	foreach my $f_pos (@$query){
		my $flag = -($num_seq-1)*($threshold/100);
		foreach my $ref_pos (@$ref) {
			++$flag if $f_pos == $ref_pos;
		}
		push @results, $f_pos unless $flag > 0;
	}
	return @results;
}

# to get a sequence's string from an aligment just use the next subroutine
sub get_sequence {
	my($name, $aln) = @_;
	foreach my $seq ( $$aln->each_seq_with_id($name) ) {
		return $seq->seq() if $seq->id() eq $name;
	}
}

# checks whether the query is a file or a directory and returns the file(s) to query in an array
# it also checks which files in a directory are alignments in ARCT general format
sub dir_or_file {
	my($dir_file) = @_;

	if (-d $dir_file) {
		$dir_file .= "/" unless ($dir_file =~ /\/$/);
	}

	undef my @in_files;
	if ($dir_file =~ /\/$/) {
		my @files = <$dir_file*>;
	      	foreach (@files) {
			push @in_files, $_ if (/al\.fa$/ || /\.clw$/);	# finds alignments
		}
	} else {
		push @in_files, $dir_file;
	}
	return @in_files;
}

# subroutine to analyse an aligment for each type of matches and mismatches
# it returns the identity of the alignment
sub align_stats {
	my($aln, $mt, $st, $wk, $ms) = @_;
	my $line = $aln->match_line();
	for (0..(length($line)-1)) {
		my $char = substr($line, $_, 1);
		if ($char eq ":") {
			push @$st, $_;
		} elsif ($char eq ".") {
			push @$wk, $_;
		} elsif ($char eq "*") {
			push @$mt, $_;
		} else {
			push @$ms, $_;
		}
	}
	return int($aln->percentage_identity());
}

# algorithm to search each group of organisms or all organisms grouped into one array
sub compare_sequences {
	my ($ref, $in_path, $out_path, $temp_t, $temp_o) = @_;

	# first we prepare the directory tree
	mkdir $out_path, 0755 unless -e  $out_path;

	my @types = @$temp_t;
	my @organisms = @$temp_o;

	# then we prepare the data
	prepare_data($in_path, $out_path, $ref, \@organisms, \@types);

	# finally, we make the calculations
	process_data($out_path, 80, "10+12",  1, @types);
}

# subroutine that initializes and prepares the files to be aligned from a dataset
sub prepare_data {
	my ($path, $out_path, $ref, $species, $types) = @_;
	$path = "" if $path eq "/";
	foreach (@$types) {
		mkdir $out_path."/".$_, 0755 unless -e  $out_path.$_;
		my $in  = Bio::SeqIO->new(-file => "<".$path.$_.$ref.".fa" , '-format' => 'Fasta');
		while (my $seq = $in->next_seq) {
			my $name = $seq->id();
			$name .= "i" if (-e $out_path.$_."/".$name.".fa");
			my $out  = Bio::SeqIO->newFh(-file =>  ">".$out_path."/".$_."/".$name.".fa" , '-format' => "Fasta");
			$seq->id("$ref");
			print $out $seq;
		}
	}
	foreach my $animal (@$species) {
		undef my %control;
		foreach (@$types) {
			my $in  = Bio::SeqIO->new(-file => "<".$path.$_.$animal.".fa" , '-format' => 'Fasta');
			while (my $seq = $in->next_seq) {
				my $name = $seq->id();
				$name .= "i" if ($control{$name} && $control{$name} == 1);
				my $out  = Bio::SeqIO->newFh(-file =>  ">>".$out_path."/".$_."/".$name.".fa" , '-format' => "Fasta");
				$seq->id("$animal");
				print $out $seq;
				$control{$name} = 1;
			}
		}
	}
}

# subroutine that makes the calculations using clustalw and gibbs
sub process_data {
	my($dir, $cs_threshold, $g_lengths, $g_cutoff, @types) = @_;

	foreach (@types) {
		my $nuc_aa = "aa";
		$nuc_aa = "n" unless $_ eq "CDS";

		my $directory .= $dir."/".$_."/";

		my @in_files = <$directory*>;

		foreach my $file (@in_files) {
			my @name = split /\./, $file;
			my $out = $name[0]."al.".$name[1];

			# ClustalW
			clustalw($file, "Fasta", $out);
			my $out_seq = $name[0]."cs.".$name[1];
			seek_align($out,$out_seq,$nuc_aa,$cs_threshold);	# gets consensus sequence

			# Gibbs
			my $g_path = '/usr/local/bin/gibbs';	# Your location of Gibbs
			my $expected = 4 if $nuc_aa eq "n";	# Omit if loooking for aa motifs
			gibbs($g_path,$file,$g_lengths,$nuc_aa,$expected,$g_cutoff);
		}
	}
}

1;
