#! /usr/bin/perl
# $Id: gmap_build.pl.in 69408 2012-07-20 21:22:20Z twu $

use warnings;	

# Installation-time settings.
# $gmapdb is the fallback destination directory used when -D is not given.
$gmapdb = '/home/jourdren/home-net/eoulsan-mapper-compilation/gmap-build/share';
# Package version string shown in the usage message.
$package_version = '2012-07-20';

use File::Copy;	
use Getopt::Long;

# Require option names to be spelled out in full (no automatic abbreviation)
# and matched case-sensitively, so that -D and -d remain distinct options.
Getopt::Long::Configure(qw(no_auto_abbrev no_ignore_case_always));


# Default values (each may be overridden by the matching option below)
$bindir = "/home/jourdren/home-net/eoulsan-mapper-compilation/gmap-build/bin";
$builddir = ".";
$sampling = 3;
$sleeptime = 2;

# Parse the command line.  GetOptions returns false when it meets an unknown
# or malformed option; stop there instead of building with settings the user
# did not ask for.
GetOptions(
    'B=s' => \$bindir,             # binary directory
    'T=s' => \$builddir,           # temporary build directory

    'D|dir=s' => \$destdir,        # destination directory
    'd|db=s' => \$dbname,          # genome name

    'M|mdfile=s' => \$mdfile,      # NCBI MD file

    'k|kmer=s' => \$kmersize,      # k-mer size for genomic index (allowed: 16 or less)
    'b|basesize=s' => \$basesize,  # offsetscomp basesize
    'q=s' => \$sampling,           # sampling interval for genome (default: 3)

    's|sort=s' => \$sorting,       # Sorting
    'g|gunzip' => \$gunzipp,       # gunzip files
    'w=s' => \$sleeptime           # waits (sleeps) this many seconds between steps.  Useful if there is a delay in the filesystem.
    ) or do {
    print_usage();
    die "Invalid command-line options.";
};



# Settle the k-mer size and the offsetscomp base size.
#   - no -k:               k = 15, basesize = 12 (any -b value is overridden)
#   - -k 15 without -b:    basesize = 12
#   - other -k without -b: basesize = k
# A user-supplied -k is checked against the documented limit (16 or less)
# before it is compared numerically or handed to gmapindex.
if (!defined($kmersize)) {
    print STDERR "-k flag not specified, so building with default 15-mers and base size 12-mers\n";
    $kmersize = 15;
    $basesize = 12;
} else {
    if ($kmersize !~ /\A\d+\z/ || $kmersize < 1 || $kmersize > 16) {
        die "k-mer size (-k) must be an integer between 1 and 16, but got '$kmersize'.";
    }
    if (!defined($basesize)) {
        if ($kmersize == 15) {
            print STDERR "-k flag specified as 15, but not -b, so building with basesize == 12\n";
            $basesize = 12;
        } else {
            print STDERR "-k flag specified (not as 15), but not -b, so building with basesize == kmer size\n";
            $basesize = $kmersize;
        }
    }
}

# The base size is later formatted with %02d into index file names, so it
# has to be a positive integer as well.
if ($basesize !~ /\A\d+\z/ || $basesize < 1) {
    die "Base size (-b) must be a positive integer, but got '$basesize'.";
}


# Resolve the genome name and the destination directory.
# A -d value of the form "dir/name" is split: "name" becomes the genome name
# and "dir" is appended to -D (or becomes the destination when -D is absent).
if (!defined($dbname)) {
  print_usage();
  die "Must specify genome database name with -d flag.";
} elsif ($dbname =~ /(\S+)\/(\S+)/) {
  # Copy the captures right away: the /\S/ test on $destdir below is itself
  # a successful match and resets $1, which previously dropped the directory
  # part whenever -D was also given.
  my ($dbdir, $dbleaf) = ($1, $2);
  $dbname = $dbleaf;
  if (defined($destdir) && $destdir =~ /\S/) {
    $destdir = $destdir . "/" . $dbdir;
  } else {
    $destdir = $dbdir;
  }
}

$dbname =~ s/\/$//;	# In case user gives -d argument with trailing slash

# Fall back to the configured gmapdb directory when no destination was given.
if (!defined($destdir) || $destdir !~ /\S/) {
    $destdir = $gmapdb;
}

# Turn the parsed options into the flag strings that are spliced into the
# external command lines.

# Chromosome ordering: pass -s through only when the user supplied it.
$chr_order_flag = "";
if (defined($sorting)) {
    $chr_order_flag = "-s $sorting";
} else {
    # Default is to order genomes
    print STDERR "Sorting chromosomes in chrom order.  To turn off or sort other ways, use the -s flag.\n";
}

# Input decompression: -g is forwarded when --gunzip was given.
$gunzip_flag = defined($gunzipp) ? "-g" : "";


# Whatever remains on the command line is the list of FASTA files.
$genome_fasta = join(" ", @ARGV);



#####################################################################################

# Run the build steps in order.  Each step dies on failure, which stops the
# pipeline before the later steps are attempted.
foreach my $step (
    \&create_coords,
    \&create_genome_version,
    \&make_contig,
    \&compress_genome,
    \&create_index_offsets,
    \&create_index_positions,
    \&install_db,
) {
    $step->();
}

exit;


#####################################################################################


# Writes $builddir/$dbname.coords by running md_coords (when an MD file was
# given with -M) or fa_coords (on the FASTA files otherwise), then sleeps
# for $sleeptime seconds.
# Dies if the command cannot be started, is killed by a signal, or exits
# with a non-zero status.
sub create_coords {
    my $cmd = defined($mdfile)
        ? "$bindir/md_coords -o $builddir/$dbname.coords $mdfile"
        : "$bindir/fa_coords $gunzip_flag -o $builddir/$dbname.coords $genome_fasta";

    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the program could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Guard used before steps that read the coords file: dies unless
# $builddir/$dbname.coords exists and has non-zero size.
sub check_coords_exists {
    my $coords_present = -s "$builddir/$dbname.coords";
    return if $coords_present;

    die "ERROR: $builddir/$dbname.coords not found.\n";
}

# Writes the genome name, followed by a newline, to
# $builddir/$dbname.version, then sleeps for $sleeptime seconds.
# Dies, naming the file, if it cannot be opened, written or closed.
sub create_genome_version {
    my $version_file = "$builddir/$dbname.version";

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as part of the open mode, and the handle cannot collide
    # with a global bareword.
    open(my $fh, '>', $version_file)
        or die "Cannot open $version_file for writing: $!";
    print {$fh} "$dbname\n"
        or die "Cannot write to $version_file: $!";
    # Buffered write errors surface at close time, so check it too.
    close($fh)
        or die "Cannot close $version_file: $!";

    sleep($sleeptime);
    return;
}

# Pipes the output of gmap_process (run on the FASTA files with the coords
# file) into "gmapindex -A", passing the chromosome ordering flag, then
# sleeps for $sleeptime seconds.
# Requires a non-empty coords file.  Dies if the pipeline cannot be started,
# is killed by a signal, or exits with a non-zero status.
sub make_contig {
    check_coords_exists();

    my $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -D $builddir -A $chr_order_flag";
    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the command could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Pipes the output of gmap_process (run on the FASTA files with the coords
# file) into "gmapindex -G", reading from and writing to $builddir, then
# sleeps for $sleeptime seconds.
# Dies if the pipeline cannot be started, is killed by a signal, or exits
# with a non-zero status.
sub compress_genome {
    my $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -F $builddir -D $builddir -G";
    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the command could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Re-runs make_contig and then pipes gmap_process output into
# "gmapindex -l -G", reading from and writing to $builddir, then sleeps for
# $sleeptime seconds.
# Requires a non-empty coords file.  Dies if the pipeline cannot be started,
# is killed by a signal, or exits with a non-zero status.
# NOTE(review): nothing in the visible build sequence calls this sub.
sub full_ASCII_genome {
    check_coords_exists();
    make_contig();

    my $cmd = "$bindir/gmap_process $gunzip_flag -c $builddir/$dbname.coords $genome_fasta | $bindir/gmapindex -d $dbname -F $builddir -D $builddir -l -G";
    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the command could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Feeds $builddir/$dbname.genomecomp to "gmapindex -O" with the chosen base
# size, k-mer size and sampling interval, then sleeps for $sleeptime
# seconds.
# Dies if the pipeline cannot be started, is killed by a signal, or exits
# with a non-zero status.
sub create_index_offsets {
    my $cmd = "cat $builddir/$dbname.genomecomp | $bindir/gmapindex -b $basesize -k $kmersize -q $sampling -d $dbname -F $builddir -D $builddir -O";
    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the command could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Feeds $builddir/$dbname.genomecomp to "gmapindex -P" with the chosen base
# size, k-mer size and sampling interval, then sleeps for $sleeptime
# seconds.
# Dies if the pipeline cannot be started, is killed by a signal, or exits
# with a non-zero status.
sub create_index_positions {
    my $cmd = "cat $builddir/$dbname.genomecomp | $bindir/gmapindex -b $basesize -k $kmersize -q $sampling -d $dbname -F $builddir -D $builddir -P";
    print STDERR "Running $cmd\n";

    # system() returns the raw wait status: -1 means the command could not
    # be launched, the low 7 bits hold a terminating signal, and the exit
    # code is in the high byte.
    my $status = system($cmd);
    if ($status == -1) {
        die "$cmd could not be started: $!";
    } elsif ($status & 127) {
        die sprintf("%s was killed by signal %d", $cmd, $status & 127);
    } elsif ($status != 0) {
        die sprintf("%s failed with return code %d", $cmd, $status >> 8);
    }

    sleep($sleeptime);
    return;
}

# Moves the built files from $builddir into $destdir/$dbname, creating that
# directory and its "$dbname.maps" subdirectory, and finally removes the
# temporary coords file.
# The index file names depend on $basesize, $kmersize and $sampling; the
# gammaptrs file is expected only when $kmersize exceeds $basesize.
# Dies if a directory cannot be created, or (after attempting every file)
# if any file could not be moved.
sub install_db {
    my @suffixes = (
        "chromosome",
        "chromosome.iit",
        "chrsubset",
        "contig",
        "contig.iit",
        "genomecomp",
        "version");

    if ($kmersize > $basesize) {
        push @suffixes, sprintf("ref%02d%02d%dgammaptrs", $basesize, $kmersize, $sampling);
    }
    push @suffixes, sprintf("ref%02d%02d%doffsetscomp", $basesize, $kmersize, $sampling);
    push @suffixes, sprintf("ref%02d%dpositions", $kmersize, $sampling);

    my $dbdir   = "$destdir/$dbname";
    my $mapsdir = "$dbdir/$dbname.maps";

    print STDERR "Copying files to directory $dbdir\n";

    # List-form system() bypasses the shell, so directory names containing
    # spaces or shell metacharacters are passed through unchanged.
    foreach my $dir ($dbdir, $mapsdir) {
        system("mkdir", "-p", $dir) == 0
            or die "Could not create directory $dir";
    }
    chmod(0755, $mapsdir)
        or warn "Could not set permissions on $mapsdir: $!\n";

    # Try every file before giving up, so one report lists everything that
    # is missing.
    my @failed;
    foreach my $suffix (@suffixes) {
        my $source = "$builddir/$dbname.$suffix";
        my $target = "$dbdir/$dbname.$suffix";
        if (!move($source, $target)) {
            warn "Could not move $source to $target: $!\n";
            push @failed, "$dbname.$suffix";
            next;
        }
        chmod(0644, $target)
            or warn "Could not set permissions on $target: $!\n";
    }

    # Equivalent of "rm -f": a coords file that is already gone is not an error.
    unlink("$builddir/$dbname.coords");

    if (@failed) {
        die "Failed to install: " . join(", ", @failed) . "\n";
    }
    return;
}



# Prints the command-line help text, including the package version, to
# standard output.  The long option names listed here mirror the ones
# registered with GetOptions (in particular --mdfile for -M).
sub print_usage {
  print <<TEXT1;

gmap_build: Builds a gmap database for a genome to be used by GMAP or GSNAP.
Part of GMAP package, version $package_version.

A simplified alternative to using the program gmap_setup, which creates a Makefile.

Usage: gmap_build [options...] -d <genomename> <fasta_files>

Options:
    -D, --dir=STRING        Destination directory for installation (defaults to gmapdb directory specified at configure time)
    -d, --db=STRING         Genome name
    -T STRING               Temporary build directory

    -M, --mdfile=STRING     Use MD file from NCBI for mapping contigs to chromosomal coordinates

    -k, --kmer=INT          k-mer value for genomic index (allowed: 16 or less, default is 15)
    -b, --basesize=INT      Basesize for offsetscomp (if kmer chosen and not 15, default is kmer; else default is 12)
    -q INT                  sampling interval for genome (allowed: 1-3, default 3)
    -s, --sort=STRING       Sort chromosomes using given method:
			      none - use chromosomes as found in FASTA file(s)
			      alpha - sort chromosomes alphabetically (chr10 before chr 1)
			      numeric-alpha - chr1, chr1U, chr2, chrM, chrU, chrX, chrY
			      chrom - chr1, chr2, chrM, chrX, chrY, chr1U, chrU
    -g, --gunzip            Files are gzipped, so need to gunzip each file first
    -w INT                  Wait (sleep) this many seconds after each step (default 2)

TEXT1
  return;
}

