// =====================================================================================
// 
//       Filename:  ReleaseInterface.h
// 
//    Description:  The interface for the final user
// 
//        Version:  1.0
//        Created:  11/12/2009 08:57:00 AM
//       Revision:  none
//       Compiler:  g++
// 
//         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
//        Company:  THU
// 
// =====================================================================================

#ifndef ReleaseInterface_H 
#define ReleaseInterface_H

#include <string>
#include <iostream>
#include "GenResult.h"
#include "DataProcessor.h"
#include "MishMash.h"
#include <vector>
#include "InstanceWriter.h"
#include "InstanceReader.h"
#include "IsoInferPE.h"
#include "ResultSummary.h"


#include "ToolBase.h"

using namespace std;

/*
 * =====================================================================================
 *        Class:  ReleaseInterface
 *  Description:  This class provides methods to operate on and process the data.
 * =====================================================================================
 */
class ReleaseInterface : public ToolBase
{
	public:
		/* ====================  LIFECYCLE     ======================================= */
		ReleaseInterface () {}                             /* constructor */

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  ReleaseInterface 
		 *       Method:  Help
		 *  Description:  Prints the complete usage text (jobs, parameters, file formats and
		 *                a worked example) to standard output.
		 *        Param:  none
		 *       Return:  nothing
		 *--------------------------------------------------------------------------------------
		 */
		virtual
		void
		Help()
		{
			cout << endl;
			cout << "      ============================================================================ " << endl;
			cout << "          IsoInfer : Infer isoforms based on single-end, paired-end, TSS/PAS and " << endl;
			cout << "                     exon-intron boundary information. " << endl;
			cout << endl;
			cout << "          Author   : Jianxing Feng. feeldead@gmail.com" << endl;
			cout << "            Date   : Thu Nov 12 2009     " << endl;
			cout << "          Version  : 0.4                              " << endl;
			cout << "      ============================================================================ " << endl;
			cout << endl;
			cout << "      USAGE:   IsoInfer <JOB> <PARAMETERS>          " << endl;
			cout << endl;
			cout << "      JOBS : -------------------------------------------------" << endl;
			cout << endl;
			cout << "          -h : This help" << endl;
			cout << endl;
			cout << "          -ext_junc_ref: Extract junction ref sequence. -rstart, -bound, -grange, -tsspas, -ref, -l, -c " << endl;
			cout << "               are required." << endl;
			cout << endl;
			cout << "          -gen_instance: Generate instances of problem for IsoInfer. Expression level will be used to define " << endl;
			cout << "               expressed segments. A segment is expressed if the expression level on this segment is above the " << endl;
			cout << "               expression level specified by -noise. -bound, -grange, -tsspas, -l, -c, -m, -mj are required." << endl;
			cout << "               -pe_info is optional." << endl;
			cout << endl;
			cout << "          -predict     : Infer isoforms provided the instances generated by -gen_instance. -ins, -conf_level," << endl;
			cout << "               -minexp, -mindup, -ps, -bpe, -bse are required." << endl;
			cout << endl;
			cout << endl;
			cout << "      PARAMETERS : -------------------------------------------------" << endl;
			cout << endl;
			cout << "          -rstart <a number>: For job -ext_junc_ref, the parameter specifies the start position of the first " << endl;
			cout << "                nucleotide of a chromosome. This parameter is to make sure that the coordinations used in the " << endl;
			cout << "                program is consistent with the coordinations provided by -bound, -grange and -tsspas. Default 0." << endl;
			cout << endl;
			cout << "          -bound <file> : Boundary file. The format of the file is : " << endl;
			cout << "                chromosome  strand  position  type" << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * Each line corresponds to a boundary with a certain type." << endl;
			cout << "                * The position is always the position of the first base of an exon or intron." << endl;
			cout << "                * 'type' are binary. type = 0 for intron -> exon, type = 1 for exon -> intron." << endl;
			cout << "                * It is possible that a boundary is both type 0 and type 1. In this case, provide " << endl;
			cout << "                  two lines for this boundary, with one line for type 0 and another line for type 1." << endl;
			cout << "                * If type information is unavailable, set types as 0 for all the boundaries." << endl;
			cout << endl;
			cout << "          -grange <file> : Gene range file. The format of the file is : " << endl;
			cout << "                 gene_name chromosome  strand  start_position end_position" << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * Each line corresponds to a gene." << endl;
			cout << endl;
			cout << "          -tsspas <file> : TSS and PAS file. The format of the file is : " << endl;
			cout << "                    gene_name   TSSs   PASs " << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * gene_name should be consistent with the gene range file (specified by -grange)." << endl;
			cout << "                * TSSs or PASs are separated by commas. In each line, an isoform starting from one " << endl;
			cout << "                  element in TSSs must end with some element in PASs on the same line. " << endl;
			cout << "                * For a gene, multiple lines may be provided. There is no constraint on different lines." << endl;
			cout << endl;
			cout << "          -ref <file> : Reference sequence in a single file." << endl;
			cout << endl;
			cout << "          -m <file> : A file containing the mapping information of short reads to the ref sequence." << endl;
			cout << "                The format of this file is:" << endl;
			cout << "                chromosome  strand  position" << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * In each line, 'position' is the start position of the read mapping to the ref." << endl;
			cout << endl;
			cout << "          -mj <file> : A file containing the mapping information of short reads to the junctions." << endl;
			cout << "                The format of this file is:" << endl;
			cout << "                chromosome  strand  position1 position2 shift" << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * Each line should contain the information of a read mapping to a junction. 'position1' " << endl;
			cout << "                  is the start position of the first part of the junction reference sequence. 'position2'" << endl;
			cout << "                  is the start position of the second part of the junction reference sequence. This two " << endl;
			cout << "                  positions are contained in the output of -ext_junc_ref. Please see -o for more information." << endl;
			cout << "                  'shift' is the start position of the read mapping to the junction reference sequence." << endl;
			cout << "                  _____________________________________________________________________________________ " << endl;
			cout << "                 |+++++++  : read                                                                      |" << endl;
			cout << "                 |-------  : intron                                                                    |" << endl;
			cout << "                 |=======  : exon                                                                      |" << endl;
			cout << "                 |                              ++++++++++++++++++                                     |" << endl;
			cout << "                 |mapping  :                    |     | \\         \\                                    |" << endl;
			cout << "                 |                              |     |  \\         \\                                   |" << endl;
			cout << "                 |junc ref :     ----==================---===================---------                 |" << endl;
			cout << "                 |                   ^position1        ^position2                                      |" << endl;
			cout << "                 |                   [  shift   )                                                      |" << endl;
			cout << "                 |_____________________________________________________________________________________|" << endl;
			cout << endl;
			cout << "          -l <a number> : The length of a read. Default 25" << endl;
			cout << endl;
			// The documented default must match cross_strength's initial value in Main().
			cout << "          -c <a number> : The cross strength to generate junction ref sequence. Default 5" << endl;
			cout << endl;
			cout << "          -s <T/F> : Whether the operations are strand specific or not? Default F" << endl;
			cout << endl;
			cout << "          -ins <file> : A file containing instances." << endl;
			cout << endl;
			cout << "          -noise <a number> : The noise level in RPKM. Segments below this expression level is treated " << endl;
			cout << "               as introns. Default 1." << endl;
			cout << endl;
			cout << "          -pe_info <file> : A file of PE information. Each line of the file consists of 4 fields :" << endl;
			cout << "               PE_length span_mean span_std mapping_file." << endl;
			cout << "                * Every two consecutive fields are separated by a single TAB." << endl;
			cout << "                * If there are several group of PE reads with different length/span_mean/span_std," << endl;
			cout << "                  put them into different lines." << endl;
			cout << "                * Each line, corresponding to a PE read, of the mapping_file contains 10 fields:" << endl;
			cout << "                     [0]    chromosome" << endl;
			cout << "                     [1]    strand" << endl;
			cout << "                     [2]    mapping start position of the first part of the first end of the PE read" << endl;
			cout << "                     [3]    the length the first part of the first end of the PE read" << endl;
			cout << "                     [4]    mapping start position of the second part of the first end of the PE read" << endl;
			cout << "                     [5]    the length the second part of the first end of the PE read" << endl;
			cout << "                     [6]    mapping start position of the first part of the second end of the PE read" << endl;
			cout << "                     [7]    the length the first part of the second end of the PE read" << endl;
			cout << "                     [8]    mapping start position of the second part of the second end of the PE read" << endl;
			cout << "                     [9]    the length the second part of the second end of the PE read" << endl;
			cout << endl;
			cout << "                  A schematic view of a PE read mapping to the genome is:" << endl;
			cout << "                  _____________________________________________________________________________________ " << endl;
			cout << "                 |+++++++  : read                                                                      |" << endl;
			cout << "                 |~~~~~~~  : gap                                                                       |" << endl;
			cout << "                 |-------  : intron                                                                    |" << endl;
			cout << "                 |=======  : exon                                                                      |" << endl;
			cout << "                 |                  first end                                  second end              |" << endl;
			cout << "                 |PE read:          +++++++++~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~++++++++               |" << endl;
			cout << "                 |mapping:         /   /  \\  \\                                 /   /  \\ \\              |" << endl;
			cout << "                 |                /   /    \\  \\                               /   /    \\ \\             |" << endl;
			cout << "                 |refseq: ----========------====----================--------======------===============|" << endl;
			cout << "                 |               ^                 : [2]                                               |" << endl;
			cout << "                 |               [----)            : [3]                                               |" << endl;
			cout << "                 |                          ^      : [4]                                               |" << endl;
			cout << "                 |                          [---)  : [5]                                               |" << endl;
			cout << "                 |_____________________________________________________________________________________|" << endl;
			cout << endl;
			cout << "                  If an end of a PE read does not map to a junction, only the first part information" << endl;
			cout << "                  is useful and the start position of the second part should be -1." << endl;
			cout << endl;
			cout << "          -conf_level <a number in [0,1]>: Set the confidence level. Default 0.05." << endl;
			cout << endl;
			cout << "          -bpe <T/F> : Use paired-end information or not. Default F. When paired-end reads are not available," << endl;
			cout << "                set this parameter to be F." << endl;
			cout << endl;
			cout << "          -bse <T/F> : Use TSS/PAS information or not. Default T." << endl;
			cout << endl;
			cout << "          -minexp <a number> : The minimum expression level. Default 1. This parameter is effective when" << endl;
			cout << "                paired-end reads are available." << endl;
			cout << endl;
			cout << "          -mindup <a number> : The minimum effective duplication of part comb. Default 1. This parameter is " << endl;
			cout << "                effective when paired-end reads are available." << endl;
			cout << endl;
			cout << "          -ps <a number> : Partition size. Default 7. On whole mouse genome, the isoform inference process " << endl;
			cout << "                (Step4 in the following example) costs about 10 minutes on a standard PC with this default " << endl;
			cout << "                parameter. A larger value is supposed to lead to better results." << endl;
			cout << endl;
			cout << "          -o <file> : A file for output." << endl;
			cout << "                * For job -ext_junc_ref, each junction forms two consecutive lines in the file. The first " << endl;
			cout << "                  line is the junction ID which is in the form of:                            " << endl;
			cout << "                       >chromosome|position1|position2|cross_len|Junc                         " << endl;
			cout << "                  The following schematic graph defines position1 and position2 for a junction" << endl;
			cout << "                  _____________________________________________________________________________________ " << endl;
			cout << "                 |-------  : intron                                                                    |" << endl;
			cout << "                 |=======  : exon                                                                      |" << endl;
			cout << "                 |L = cross_len = read_length - cross_strength                                         |" << endl;
			cout << "                 |                   [-------L------)   [-------L------)                               |" << endl;
			cout << "                 |refseq:     ----==================----======================---------                |" << endl;
			cout << "                 |                   ^position1         ^position2                                     |" << endl;
			cout << "                 |_____________________________________________________________________________________|" << endl;
			cout << endl;
			cout << "                  The second line is the concatenation of sequences [position1, position1+cross_len) " << endl;
			cout << "                  and [position2, position2+cross_len)                                               " << endl;
			cout << endl;
			cout << "                * For job -gen_instance, it OK to treat the output as a black box :-)" << endl;
			cout << "                * For job -predict, each line in the output is an predicted isoforms in a format similar" << endl;
			cout << "                  to UCSC known genes:" << endl;
			cout << "                 ID chromosome strand start_position end_position exon_start_positions exon_end_positions" << endl;
			cout << endl;
			cout << "      EXAMPLES: -------------------------------------------------" << endl;
			cout << endl;
			cout << "           Step1 : Extract junction reference sequence strand specifically with -rstart and -c being default" << endl;
			cout << endl;
			cout << "                isoinfer -ext_junc_ref -bound mybound -grange myrange -tsspas mytsspas -ref refseq -l 30 -s T -o juncref" << endl;
			cout << endl;
			cout << "           Step2 : Use Bowtie or other mapping tools to map the short reads to the reference sequence and junction" << endl;
			cout << "                reference sequence. Preparing the input for the next step" << endl;
			cout << endl;
			cout << "           Step3 : Generate instances for IsoInfer without paired-end information." << endl;
			cout << endl;
			cout << "                isoinfer -gen_instance -bound mybound -grange myrange -tsspas mytsspas -l 30 -s T \\" << endl;
			cout << "                         -m mymap -mj myjuncmap -o myins" << endl;
			cout << endl;
			cout << "           Step4 : Predict isoforms given input 'myins' without paired-end information. Set -minexp to 2, -ps to 8" << endl;
			cout << "                and all other parameters to default." << endl;
			cout << endl;
			// -o is mandatory for every job (see Main), so the example has to pass it.
			cout << "                isoinfer -predict -ins myins -minexp 2 -ps 8 -bse T -bpe F -o myresult" << endl;
			cout << endl;
			cout << "           Note : Parameters (e.g. -l, -c, -s) should be consistent in the four steps." << endl;
			cout << endl;
		}

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  ReleaseInterface 
		 *       Method:  Main
		 *  Description:  Parses the command line, validates the arguments required by the
		 *                selected job and runs that job. When several jobs are given, the
		 *                precedence is -ext_junc_ref, then -gen_instance, then -predict.
		 *                All job-specific validation happens BEFORE the output file is
		 *                opened, so a usage error never truncates an existing file.
		 *        Param:  argc     : number of entries in argv
		 *                argv     : command line arguments
		 *                startArg : index of the first entry of argv to be parsed
		 *       Return:  0 on success (or after -h). 1 if the output file can not be opened
		 *                or one of the checked loading steps fails. Usage errors (unknown
		 *                parameter, missing job/argument/value) print the help and
		 *                terminate the process with exit status 1.
		 *--------------------------------------------------------------------------------------
		 */
		virtual
		int 
		Main(int argc, char* argv[], int startArg)
		{
			string output_file;
			string ins_file;
			string refseq_file;
			string map_file;
			string map_junc_file;
			string pe_info_file;
			string bound_file;
			string grange_file;
			string tsspas_file;

			int read_len = 25;
			int start_pos_of_first_nt = 0;
			double conf_level = 0.05;
			int cross_strength = 5;        // keep in sync with the default printed by Help()
			double noise_level = 1;
			double min_exp = 1;
			int min_dup = 1;
			int part_size = 7;
			int output_format = 1;

			bool b_predict = false;
			bool b_ext_junc_ref = false;
			bool b_gen_instance = false;

			bool b_pe_enable = false;
			bool b_se_enable = true;
			bool b_strand_specific = false;

			for (int i = startArg; i < argc; i++)
			{
				if (strcmp(argv[i], "-h") == 0)
				{
					Help();
					return 0;
				}

				// Job selectors take no value.
				if (strcmp(argv[i], "-ext_junc_ref") == 0)
					b_ext_junc_ref = true;
				else if (strcmp(argv[i], "-gen_instance") == 0)
					b_gen_instance = true;
				else if (strcmp(argv[i], "-predict") == 0)
					b_predict = true;
				// Every option below consumes the following argument as its value.
				else if (strcmp(argv[i], "-ref") == 0)
					refseq_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-m") == 0)
					map_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-mj") == 0)
					map_junc_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-o") == 0)
					output_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-l") == 0)
					read_len = atoi(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-c") == 0)
					cross_strength = atoi(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-s") == 0)
					b_strand_specific = (TakeOptionValue(argc, argv, i)[0] == 'T');
				else if (strcmp(argv[i], "-ins") == 0)
					ins_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-pe_info") == 0)
					pe_info_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-rstart") == 0)
					start_pos_of_first_nt = atoi(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-noise") == 0)
					noise_level = atof(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-bound") == 0)
					bound_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-grange") == 0)
					grange_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-tsspas") == 0)
					tsspas_file = TakeOptionValue(argc, argv, i);
				else if (strcmp(argv[i], "-conf_level") == 0)
					conf_level = atof(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-bpe") == 0)
					b_pe_enable = (TakeOptionValue(argc, argv, i)[0] == 'T');
				else if (strcmp(argv[i], "-bse") == 0)
					b_se_enable = (TakeOptionValue(argc, argv, i)[0] == 'T');
				else if (strcmp(argv[i], "-minexp") == 0)
					min_exp = atof(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-mindup") == 0)
					min_dup = atoi(TakeOptionValue(argc, argv, i));
				else if (strcmp(argv[i], "-ps") == 0)
					part_size = atoi(TakeOptionValue(argc, argv, i));
				// This parameter is hidden from the user
				else if (strcmp(argv[i], "-oformat") == 0)
					output_format = atoi(TakeOptionValue(argc, argv, i));
				else 
				{
					Help();
					cerr << "Wrong parameter " << argv[i] << endl;
					exit(1);
				}
			}

			// ---- Validation. Nothing has been written to disk yet. ----
			// ArgMissing is declared outside this file (presumably in ToolBase).
			if (!b_ext_junc_ref && !b_gen_instance && !b_predict)
			{
				Help();
				ArgMissing("Job");
				exit(1);
			}

			if (output_file == "") { Help(); ArgMissing("-o"); exit(1); }

			// Both data-preparation jobs need the gene structure files.
			const bool b_data_job = b_ext_junc_ref || b_gen_instance;
			if (b_data_job)
			{
				if (bound_file == "")  { Help(); ArgMissing("-bound");  exit(1); }
				if (grange_file == "") { Help(); ArgMissing("-grange"); exit(1); }
				if (tsspas_file == "") { Help(); ArgMissing("-tsspas"); exit(1); }

				if (b_ext_junc_ref)
				{
					// Documented as required by Help() for this job.
					if (refseq_file == "") { Help(); ArgMissing("-ref"); exit(1); }
				}
				else
				{
					if (map_file == "")      { Help(); ArgMissing("-m");  exit(1); }
					if (map_junc_file == "") { Help(); ArgMissing("-mj"); exit(1); }
				}
			}
			else
			{
				if (ins_file == "") { Help(); ArgMissing("-ins"); exit(1); }
			}

			// ---- Prepare output. ----
			// The stream lives on the stack so that it is flushed and closed on every
			// return path. All objects receiving p_output below are declared in inner
			// scopes and are therefore destroyed before the stream.
			ofstream output(output_file.c_str(), ios::out);
			if (!output.is_open())
			{
				cerr << "File " << output_file << " can not be opened" << endl;
				return 1;
			}
			ofstream* p_output = &output;

			if (b_data_job)
			{
				map_str2vec_gene genes;
				map_str2vec_exon exons;
				map_str2vec_int  gene_color;
				map_str2vec_int  exon_color;

				DataProcessor dp;
				dp.mReadLen = read_len;
				dp.mCrossStrength = cross_strength;
				dp.mbStrandSpecific = b_strand_specific;
				dp.mpOutput = p_output;

				if (!dp.LoadGeneAndExon(bound_file, grange_file, tsspas_file, genes, exons)) return 1;
				dp.GroupGeneAndExon(genes, gene_color, exons, exon_color);

				if (b_ext_junc_ref)
				{
					dp.ExtractJunctionRef(exons, exon_color, genes, gene_color, refseq_file, start_pos_of_first_nt);
				}
				else
				{
					map<string, map_64_double> junc_counts;
					if (!dp.LoadJunctionReads(map_junc_file, exons, junc_counts)) return 1;

					int mapped_cnt = 0;
					if (!dp.LoadNonJunctionReads(map_file, exons, junc_counts, mapped_cnt)) return 1;

					vector<Instance> all_instances;

					// Paired-end information is optional for this job.
					if (pe_info_file != "")
					{
						vector<PEInfo> pe_infos;
						vector<string> mapping_files;

						dp.LoadPEInfo(pe_info_file, pe_infos, mapping_files);
						dp.LoadPEReads(pe_infos, mapping_files, all_instances);
					}

					dp.ExtractInstances(genes, gene_color, exons, exon_color, junc_counts, all_instances, false, noise_level);

					InstanceWriter writer(p_output);
					for (unsigned i = 0; i < all_instances.size(); i++)
						writer.OnInstance(all_instances[i]);
				}
			}
			else
			{
				// -predict
				LPsolver lp_solver;

				IsoInferPE iso_infer(&lp_solver, p_output);
				iso_infer.SetConfidencelevel(conf_level);
				iso_infer.EnableSE(b_se_enable);
				iso_infer.EnablePE(b_pe_enable);
				iso_infer.EnableStepII(true);
				iso_infer.SetMinExpLevel(min_exp);
				iso_infer.SetMinEffectivePartCombDup(min_dup);
				iso_infer.SetPartitionSize(part_size);

				ResultSummary summary(&iso_infer, p_output);
				summary.SetMinIsoCnt(0);
				summary.SetIsoCntScale(10);
				summary.SetOutputFormat(output_format); 

				InstanceReader reader(ins_file);
				reader.SetHandler(&summary);

				reader.Initialize();
				reader.Generate();
				reader.CleanUp();
			}

			output.close();
			return 0;
		}

	private:
		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  ReleaseInterface 
		 *       Method:  TakeOptionValue
		 *  Description:  Fetches the value that follows the option at argv[i]. If the option
		 *                is the last argument, prints the help and an error message and
		 *                terminates the process with exit status 1 instead of reading past
		 *                the end of argv.
		 *        Param:  argc : number of entries in argv
		 *                argv : command line arguments
		 *                i    : index of the option; advanced onto the value on success
		 *       Return:  The value string, i.e. the new argv[i].
		 *--------------------------------------------------------------------------------------
		 */
		const char*
		TakeOptionValue(int argc, char* argv[], int& i)
		{
			if (i + 1 >= argc)
			{
				Help();
				cerr << "Missing value for parameter " << argv[i] << endl;
				exit(1);
			}
			return argv[++i];
		}
}; /* -----  end of class ReleaseInterface  ----- */

#endif
