#!/usr/bin/perl
#
# ARCT -- Aging Research Computational Tools 0.9 (beta)
#
# by Joao Pedro de Magalhaes (http://jp.senescence.info/contact.php)
#
# Copyright 2003 - 2005 by Joao Pedro de Magalhaes
#
# You may distribute this script under the same terms as perl itself
#
# IMPORTANT: I am not responsible for any damage this script may cause. Use at your own risk.
#
#
# This is an example script on how to use ARCT that includes several examples. Please understand what each subroutine does before running the script, since the script is not designed as a tutorial. Instead, what follows is a description and example of the main subroutines in ARCT. Examples are included but these should be run in another file since you don't want everything in this script to run. I also recommend you to take a look at the two example scripts (example1.pl and example2.pl) for pratical examples of ARCT's functions.

die "Welcome to ARCT\n\nPlease make sure you understand the code!\n";

### ARCT::Parser

# A key in ARCT is parser.pl, which is the module that includes all subroutines to parse files, access databases, and other miscellaneous routines.

use ARCT::Parser;

# One of the first things you might want to do is get some sequences from remote databases--if you already have the sequences you want to use in a file then feel free to skip this step. Whether you're looking for genes or proteins, Parser is the module to use. There are two ways of getting sequences:

# Common parameters
my $query = "NM_003362 NM_014311 NM_003925";	# A string with the accession numbers of the sequences you want to download
my $start = 1;	# In case you are downloading a large number of sequences there could be some problem and you'll have to restart, so here you can set to add to the already existing sequences
my $out_file = "testing.sq";	# The filename where to save the results
my $format = "EMBL";		# $out_file's format. EMBL and FASTA are my two favorite formats

# Then you can either retrieve one sequence at a time or use batch mode. Batch mode is recommended since it's faster and the people at the databases prefer it that way. You should only retrieve one sequence at a time if you have a few sequences and they are NOT to be retrieved from the same database
#
# To use batch mode you must select which database you want to search
my $db = 1;	# $db = 1 for RefSeq, -1 for GenPept, anything else for GenBank

# Then invoque the get_batch subroutine using the following syntax
get_batch ($start, $format, $out_file, $db, $query);

# To retrieve one sequence at a time, you just have to indicate whether you are downloading genes or proteins
my $db = 1;	# Set $db to 1 for retrieving genes and -1 for retrieving proteins

# Use the get_single subroutine exactly like get_batch, except it will only get one sequence
get_single($start, $format, $out_file, $db, $query);

# If you want to find the position of a specific gene in a large file containing sequences, you can use the parse_gene subroutine

my $in_file = "testing.sq";
my $in_format = "EMBL";
my $query = "DDB2";

parse_gene($in_file, $in_format, $query);

# Also included in Parser is a very simple promoter retriever. It needs two parameters:
my $filename = "wrn.fa";	# The output filename

# The sequences to retrieve. In this case the WRN promoter in, respectively, human and mouse genes
my @query = qw /NM_000553 NM_011721/;

# Then invoque get_promoter as follows:
get_promoter($filename, @query);		# Have in mind that the efficiency of get_promoter varies much from gene to gene and you are strongly advized to check the gene manually in the database to make sure get_promoter is getting the sequence it should. Often, to retrieve large upstream sequences (>1000 bp), you need to do it manually, at the UCSC, for example.

# A more powerful way to obtain promoter sequences is downloading sequences from EPD.
# For example, use:
# get_prom("NM_021081");
# This method is preferred and it is likely get_promoter will be eliminated in the future.

# Once you use the Profiles module, you are likely to refine your statistical analysis. In order to parse the report generated by the Profiles module, you should use report_parser

my $filename = "comparison.txt";		# Your only parameter is the filename, normally comparison.txt
my @ref = report_parser($filename);	# report_parser returns an array with all information regarding the reference genes and the corresponding hits. The reference and each of the hits are separated by "&" and the features in each hit are separated by ","

### ARCT::Profiles

# The Profiles module is the heart of ARCT in the sense it is used to build the phylogenetic profiles and get the accession numbers necessary for everything else to work. The main subroutine is blast_away, which is used to build the phylogenetic profile of some query sequences file as well as other statistics. Its usage is simpler than appears at first:

use ARCT::Profiles;

# Parameters for BLASTing from NCBI
my $in_file = "testing.sq";	# File containing the sequences to be blasted
my $in_format = "EMBL";	# Format of $in_file. EMBL is preferred
my $out_file = "Blast/test";	# Directory and filename where we'll save the BLAST reports. Note that the directory must exist
my $start = 1;	# The first sequence to be queried
my $end = 3;	# The last sequence to be queried
my @query_species = qw /man mouse fly worm ecoli/;		# The species we want to search, see the init_prof subroutine for a list of available species and how to add more species to the list

&blast_away ($in_file, $in_format, $out_file, $start, $end, @query_species);

# blast_away will BLAST each sequence in $in_file, retrieve each report from NCBI and save it to disk, scan each report for hits of each species in @query_species and then generate a phylogenetic profile (profile.txt) as well as a report (comparison.txt) with a more detailed comparison.

# Once you have the BLAST reports locally, you can use the blast_here subroutine. It is useful, for example, if you want to refine the parameters that define and orthologue. (At present these are declared within the process_result subroutine of the Profiles module. In the future, I hope to make the parameters easier to modify.) Anyway, to use blast_here is very similar to blast_away

my $in_file = "Blast/test";	# Directory and filename of BLAST reports
my $start = 1;
my $end = 3;
my @query_species = qw /man mouse fly worm ecoli/;

&blast_here ($in_file, $start, $end, @query_species);

# In order to refine your results, you should use the report_parser subroutine in conjunction with whatever statistical analysis you want. As an example, in Profiles we've included the find_orthologs subroutines designed to find 1:1 orthologs. To build a table with 1:1 orthologs of mouse and reference (assumed to be man) just type:

my $filename = "comparison.txt";
my @ref = report_parser($filename);

find_orthologs(mouse, @ref);		# find_orthologs takes as arguments the organism to search against man and an array generated by report_parser. The output is a simple text file (orthologs.txt); find_orthologs also saves the BLAST reports as orthl.txt in the current directory.

### ARCT::Clustal

# The Clustal module is ARCT's built-in access to ClustalW. Usage is very simple:

use ARCT::Clustal;

# Parameters
my $filename = "wrn.fa";	# A file with the sequences to be aligned
my $format = "Fasta";		# $filename's format
my $out_file = "testCLW.txt";		# The name of the file to create the alignment

# Then use the subroutine clustalw as follows
clustalw($filename, $format, $out_file);

# At present the Clustal module only aligns sequences in files but it is easy to include alignments between files with alignments and alignments between files with alignments and files with sequences.

### ARCT::TF

# The TF module includes several tools to research TF and TFBS.

#use ARCT::TF;

# The first step to perform is to create a local collection of matrices using the TF_db subroutine and TRANSFAC's remote database

# Parameters
my $db_name = "testTFBS";		# Name of directory to be created with the matrices
my @query = qw /V$ELK1_01 V$AP4_01 V$SP1_01/;	# TRANSFAC IDs

TF_db($db_name, @query);

# It is also possible to pass on a file to TF_db with TRANSFAC IDs. The file can contain othe words, but a TRANSFAC ID is only recognizable if it is the first readable character in a line--and don't put multiple IDs in a single line. Example:

my $filename = "hTRANSFAC.txt";
TF_db($db_name, $filename);

# Once we have a local database of matrices we can start working. Of vital importance is to know which TF we have in the database. To list all TF in the database, just use the TF_list subroutine

my $db_name = "testTFBS";		# Name of database
TF_list($db_name);

# Also available is a subroutine (TF_names) that returns an array with the names of all TF stored in the database

my @TF_names = TF_names(testTFBS);

# And the subroutine tf_max that lists all matrices for a given TF--since often we have more than one for a single TF

my $db_name = "testTFBS";
my $query = "Sp1";
tf_max($db_name, $query);

# Once you are happy with your list of TF, you can start using the real tool called TF_scan. In brief, TF_scan searches for TFBS and creates a graphical display of the results

# Parameters
my $db_name = "testTFBS";
my $filename = "wrn.fa";	# File in FASTA format with the sequences to be searched
my $out_dir = "TFtest";		# Directory where the results will be saved
my @names = qw/hWRN mWRN/;	# OPTIONAL: If you want, you can pass names to TF_scan. If you don't, then the sequence's IDs will be used as names

# To invoque TF_scan is simple
TF_scan($filename, $db_name, $out_dir, @names);

# TF_scan will scan all sequences in $filename creating an output file for each of them using @names or their accession numbers

# As present, TF_scan searches for a selection of 23 matrices we selected for their possible involvement in aging. To change that parameter, you must change TF.pm itself (the @matrices declaration, to be precise). In the future we will include an easier way to select which matrices to search for. Such bugs are one of the reasons why ARCT cannot be installed as a normal module yet.

### ARCT::Info

# The Info module creates an HTML report with hyperlinks to GenBank, GeneCards, and Medline. Its usage is very simple:

use ARCT::Info;

# Parameters
my $filename = "info.htm";	# Output file's name
my $med_search = "aging";		# Term to search in Medline
my @query = qw/p53 AP-1/;

info($filename, $med_search, @query);

# This module is particularly useful when used in combination with TF or Profiles. For example:
my @query = TF_names($database);	# Generates a file with hyperlinks to all TF in $database

# POD

=head1 NAME

ARCT - Aging Research Computational Tools 0.9 (beta)

=head1 DESCRIPTION

ARCT is a set of scripts and modules useful for comparative genomics applied to the study of human aging.

=head1 CONTENTS

ARCT is composed of the following files:

main.pl -- this file, which is a general script for running the other modules

Clustal.pm -- module to access ClustalW from within ARCT.

Gibbs.pm -- module to access Gibbs from within ARCT.

Info.pm -- module to build HTML files with links to various databases (e.g. PubMed, GenBank, RefSeq, MapViewer, OMIM, SwissProt, GeneCards, etc.).

Mining.pm -- module for data-mining, including algorithms and statistical analysis tools.

Parser.pm -- module to parse different files, access databases and generally glue the other modules together.

Profiles.pm -- module to generate phylogenetic profiles based on DNA or protein data.

TF.pm -- module to retrieve TFBS matrices, create a local TF matrix database, find putative TFBS, and display the results.

WWW.pm -- module which includes the subroutines for the graphical display of protein interactions, phylogeny, algorithm to find CpG islands and analyse sequences, etc.

example1.pl -- example script 1

example2.pl -- example script 2

=head1 FUTURE CHANGES

There are a few features I might add to ARCT in the future (in order of relevance):

- Get promoter region directly (e.g. from UCSC) just by having an accession number

- Use the taxonomy Bioperl modules to get evolutionary relationships built into ARCT

- Make ARCT easier to integrate with Bioperl by including object-oriented modules

- Incorporate some ideas from the Uniblast and inparanoid programs

=head1 FEEDBACK

I believe cooperation is at the basis of success. Please send comments, suggestions, and bug reports to the author.

=head1 AUTHOR

Joao Pedro de Magalhaes, Ph.D. E<lt>http://jp.senescence.info/contact.phpE<gt>

Dept. of Genetics, Harvard Medical School, 77 Ave. Louis Pasteur, Room 238, Boston, MA 02115, USA.

=head1 WEBSITE

http://genomics.senescence.info/software/

=head1 DISCLAIMER

I am not responsible for any damage any of these modules may cause. Use at your own risk.

=cut