"""Configuration file for performing analysis on rice duplicated pairs.

This runs on the fly BLASTs and analysis on both single and multi-processor
machines. Maximum likelihood is used for phylogenetic analyses and t-coffee is
used for alignments.
"""
import os
import logging

from BDBStorage.BerkeleyBase import BerkeleyConfig

from Scientific.BSP import ParData

from Bio.PGML.Dating.Retrieval import FastaFileRetrieval
from Bio.PGML.Dating.Duplicate import DuplicateOrganizer
from Bio.PGML.Utils.Index import FastaTitleIndexer, FastaGIIndexer
from Bio.PGML.Duplicate.Config import BerkeleyDatabaseRetriever
from Bio.PGML.Dating.Phylogeny import BootstrapMaximumLikelihood
from Bio.PGML.Dating.Alignment import TCoffeeAlignment
from Bio.PGML.Dating.BlastDistributed import DistributedBlast, \
        LongHitProcessor, TBlastnBlaster
from Bio.PGML.Dating.Date import PhylogenyProcessor

# -- set up logging -- we'll just print everything out
logging.basicConfig()
root = logging.getLogger()
# every line from the first root handler shows time, level and message
root.handlers[0].setFormatter(
    logging.Formatter(fmt="%(asctime)s [%(levelname)s] %(message)s"))
# DEBUG on the root logger lets messages of every level through
root.setLevel(logging.DEBUG)

# --- useful constants
# Everything is located relative to the current working directory.
# Alternative based on the location of this file, currently disabled:
# base_dir = os.path.dirname(__file__)
#if not(base_dir):
#    base_dir = os.getcwd()
base_dir = os.getcwd()
# hand base_dir over as an argument so the logging module does the formatting
logging.info("Working in directory %s", base_dir)

# The initializer handed to ParData returns its first argument (named pid).
# NOTE(review): presumably this is the BSP processor id, so that pid.value
# differs between processes -- confirm against Scientific.BSP.
pid = ParData(lambda pid, nprocs: pid)

# Directories and files used by the analysis, all below base_dir.
blast_dir = os.path.join(base_dir, "blast")
index_dir = os.path.join(base_dir, "indexes")
fasta_dir = os.path.join(base_dir, "taxa_dbs")
# the last path component is str(pid.value), giving one directory per pid value
analysis_dir = os.path.join(base_dir, "analysis", str(pid.value))
# NOTE: the duplicate FASTA file and the duplicate BLAST database are set to
# the same path.
dup_file = os.path.join(base_dir, "rice", "rice_dup.fasta")
dup_blast_db = os.path.join(base_dir, "rice", "rice_dup.fasta")
# absolute path of the blastall program; passed to TBlastnBlaster as blastcmd
blast_executable = "/opt/NCBI_BLAST/blastall"

# FASTA database to BLAST against for each organism, keyed by organism name.
files = {
    "Allium": os.path.join(fasta_dir, "all.fasta"),
    "Banana": os.path.join(fasta_dir, "banana_ests.tfa"),
    "Hordeum": os.path.join(fasta_dir, "hor.fasta"),
    "Oryza minuta": os.path.join(fasta_dir, "omn.fasta"),
    "Phycomitrella": os.path.join(fasta_dir, "phy.fasta"),
    "Pinaceae": os.path.join(fasta_dir, "pin.fasta"),
    "Sorghum": os.path.join(fasta_dir, "sor.fasta"),
}

# passed as the e_value argument of every TBlastnBlaster built below
e_value_thresh = 1e-5

# --- useful functions

def genbank_match_parser(match):
    """Pull the GI number out of a standard GenBank header.

    The identifier is the first whitespace-delimited word of the header; the
    second of its '|'-separated fields is returned.
    """
    identifier = match.split(None, 1)[0]
    return identifier.split("|")[1]

def easy_match_parser(match):
    """Remove a leading '>' from the match name, if there is one.

    Names that do not start with '>' are returned unchanged, so the first
    character of an already clean name is never lost.
    """
    if match.startswith(">"):
        return match[1:]
    return match

def rice_title_to_dup_name(title):
    """Process a rice FASTA title and return duplicate location information.

    This returns three things:
    1. Name of the event the duplicate is located in.
    2. Name of the duplicate block the duplicate is located in.
    3. Name of the duplicate pair.

    Titles look like 'DxP0001aG#11888', optionally with a leading '>'; that
    example gives ('rice_alpha', 'Dx', '0001').

    Raises ValueError if the title does not have this layout, or if its
    block does not belong to any known event.
    """
    # NOTE(review): "D7" used to appear twice in this list, which has no
    # effect on the lookup; confirm that no other block name was intended.
    event_info = {
     "rice_alpha" : ["Dx", "D1", "D2", "D3", "D4", "D5", "D6", "D7",
                     "D8", "D9"]
    }
    if title.startswith(">"): # get rid of a FASTA thingy if we've got it
        title = title[1:]
    try:
        # exactly one '#' must separate the duplicate name from the rest
        john_part, _ = title.split("#")
        # drop the trailing letter and the 'G', then split block from pair
        dup_block, pair_name = john_part[:-2].split("P")
    except ValueError:
        raise ValueError("Could not parse duplicate title: %r" % title)

    for event_name, blocks in event_info.items():
        if dup_block in blocks:
            return event_name, dup_block, pair_name
    raise ValueError("Could not place block: %s" % dup_block)

# --- Configuration information

def _get_distributed_blast(name, parser = None):
    """Build the DistributedBlast configuration for one entry of files.

    name selects the database to search from files and is also handed to the
    LongHitProcessor. parser is passed on to the LongHitProcessor as well;
    when it is None, genbank_match_parser is used.
    """
    match_parser = genbank_match_parser if parser is None else parser
    blaster = TBlastnBlaster(
        database = files[name],
        matrix = "BLOSUM62",
        e_value = e_value_thresh,
        work_dir = analysis_dir,
        blastcmd = blast_executable)
    hit_processor = LongHitProcessor(
        name, blaster, rice_title_to_dup_name, match_parser)
    return DistributedBlast(dup_file, hit_processor)

# One distributed BLAST for every entry of files. Banana is the only one
# given easy_match_parser; the others get the default parser chosen by
# _get_distributed_blast.
blasts = [
  _get_distributed_blast("Pinaceae"),
  _get_distributed_blast("Allium"),
  _get_distributed_blast("Banana", easy_match_parser),
  _get_distributed_blast("Sorghum"),
  _get_distributed_blast("Hordeum"),
  _get_distributed_blast("Oryza minuta"),
  _get_distributed_blast("Phycomitrella")
]

# Sequence retrieval for each organism. Every retriever reads the FASTA file
# registered for that organism in files, i.e. the one that is BLASTed against.
# Banana is indexed with FastaTitleIndexer, all the others with FastaGIIndexer.
org_retrievers = {
    "Pinaceae": FastaFileRetrieval(
        index_dir, "Pinaceae", files["Pinaceae"], FastaGIIndexer()),
    "Allium": FastaFileRetrieval(
        index_dir, "Allium", files["Allium"], FastaGIIndexer()),
    "Sorghum": FastaFileRetrieval(
        index_dir, "Sorghum", files["Sorghum"], FastaGIIndexer()),
    "Hordeum": FastaFileRetrieval(
        index_dir, "Hordeum", files["Hordeum"], FastaGIIndexer()),
    "Oryza minuta": FastaFileRetrieval(
        index_dir, "Oryza minuta", files["Oryza minuta"], FastaGIIndexer()),
    "Phycomitrella": FastaFileRetrieval(
        index_dir, "Phycomitrella", files["Phycomitrella"], FastaGIIndexer()),
    "Banana": FastaFileRetrieval(
        index_dir, "Banana", files["Banana"], FastaTitleIndexer()),
}

# Disabled variant of the organism list that also includes "Oryza minuta":
#analysis_orgs = ["Pinaceae", "Allium", "Banana", "Sorghum",
#                 "Hordeum", "Oryza minuta"]
# Organisms currently selected for the analysis.
analysis_orgs = ["Pinaceae", "Allium", "Banana", "Sorghum", "Hordeum"]
# Disabled variant of the list below:
#no_stats_orgs = ["Allium", "Oryza minuta"]
# Currently empty.
# NOTE(review): presumably organisms to leave out of the statistics -- confirm
# against the code that reads this configuration.
no_stats_orgs = []
# NOTE(review): presumably the organism used to root the analysis; it is a key
# of files and org_retrievers but not part of analysis_orgs -- confirm.
analysis_root = "Phycomitrella"

# Uses the same title parser that is handed to the BLAST hit processors.
dup_organizer = DuplicateOrganizer(dup_file, dup_blast_db,
        rice_title_to_dup_name, analysis_dir)

# alignment and phylogeny values
# The two alignment minimums are passed to PhylogenyProcessor below.
minimum_align_length = 35
minimum_align_score = 52
# Not used by any statement shown in this configuration.
# NOTE(review): presumably read by the code that loads it -- confirm.
minimum_bootstrap = 20.0
# Both tools are given analysis_dir as their only directory argument.
alignment = TCoffeeAlignment(analysis_dir)
# NOTE(review): 25 is presumably the number of bootstrap replicates -- confirm
# against BootstrapMaximumLikelihood.
phylogeny = BootstrapMaximumLikelihood(25, analysis_dir)
phylogeny_work = PhylogenyProcessor(alignment, phylogeny,
        minimum_align_length, minimum_align_score,
        analysis_dir)

# BerkeleyDB storage information
berkeley_config = BerkeleyConfig()
# log directory sits directly below base_dir
berkeley_config.logdir = os.path.join(base_dir, "bdb_log")
# NOTE(review): the unit is not visible here; presumably seconds, i.e. one
# hour -- confirm against BerkeleyConfig.
berkeley_config.frequency = 3600
# databases live in the "dating_db" directory below base_dir
db_retriever = BerkeleyDatabaseRetriever(
        database_dir = os.path.join(base_dir, "dating_db"),
        config = berkeley_config)
# NOTE(review): presumably the organism and database names used together with
# db_retriever by the code that reads this configuration -- confirm.
db_org = "Rice"
blast_db_name = "Duplicate_BLASTs"
dating_db_name = "Phylogeny_Dating"
