/*
 * =====================================================================================
 *
 *       Filename:  DataProcessor.cc
 *
 *    Description:  The implementation of class DataProcessor
 *
 *        Version:  1.0
 *        Created:  04/17/2009 04:38:40 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
 *        Company:  THU
 *
 * =====================================================================================
 */

#include "DataProcessor.h"
#include "InstanceWriter.h"
#include "InstanceReader.h"
#include "MishMash.h"
#include <cassert>
#include <algorithm>
#include <string.h>

/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  DataProcessor
 * Description:  Default constructor. The body is intentionally empty: the run-time
 *               settings used by this class (mReadLen, mCrossStrength,
 *               mbStrandSpecific, mpOutput) are assigned by Main() from the
 *               command line, not here.
 *--------------------------------------------------------------------------------------
 */
DataProcessor::DataProcessor ()
{
}  /* -----  end of method DataProcessor::DataProcessor  (constructor)  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  Help
 *  Description:  Print the usage text of this sub-program (the available jobs and
 *                their parameters) to stdout and then terminate the process.
 *        Param:  none
 *       Return:  Does not return to the caller: exit(0) is called at the end.
 *--------------------------------------------------------------------------------------
 */
/*virtual*/
void
DataProcessor::Help()
{
	cout << endl;
	cout << "      ============================================================================ " << endl;
	cout << "          DataProcessor" << endl;
	cout << "      ---------------------------------------------------------------------------- " << endl;
	cout << endl;
	cout << "      This sub-program operates and prepare all the data" << endl;
	cout <<                                                   endl;
	cout <<                                                   endl;
	cout << "      USAGE:             " << endl;
	cout <<                                endl;
	// Job switches. NOTE(review): Main() also accepts "-ext_exon" and "-g",
	// neither of which is described in this text.
	cout << "--------Jobs : -------------------------------------------------" << endl;
	cout <<                                                          endl;
	cout << "          -ext_junc_ref <a number>: Extract junction ref sequence. The parameter is the start position " << endl;
	cout << "               of the first neocliotide of a chromosome" << endl;
	cout << "               -bound, -grange, -tsspas, -r, -l, -c are required" << endl;
	cout <<                                                          endl;
	cout << "          -ext_known_junc_ref <a number>: Extract junction ref sequence. The junctions will be extracted" << endl;
	cout << "               from the file specified by -grange. The parameter is the start position of the first " << endl;
	cout << "               neocliotide of a chromosome -grange, -r, -l, -c are required" << endl;
	cout <<                                                          endl;
	cout << "          -gen_instance <T/F>: Generate instances of problem for IsoInfer. If T is specified and the exons" << endl;
	cout << "               of each isoform are provided, expressed segments are extracted from the provided exons. " << endl;
	cout << "               Otherwise, expression level will be used to define expressed segments. A segment is expressed," << endl;
	cout << "               if the expression level on this segment is above the expression level specified by -noise" << endl;
	cout << "               -bound, -grange, -tsspas, -l, -c, -m, -mj are required" << endl;
	cout <<                                                          endl;
	cout << "          -high_isoform <T/F>: Given a set of isoforms (-grange), output max expression level of valid isoforms. " << endl;
	cout << "               T : check the expression level of the first and end exons. Otherwise, F. -grange, -l, -c, -m, -mj " << endl;
	cout << "               are required" << endl;
	cout <<                                                          endl;
	cout << "          -group_isoform: Group isoforms. -bound, -grange, -tsspas, should be specified." << endl;
	cout <<                                                      endl;
	cout << "          -gen_pe_reads <a number>: Generate a specified number of random paired-end reads. -ins, " << endl;
	cout << "               -pe_info should be specified. Note that the expression levels in the file specified by " << endl;
	cout << "               -ins will be used." << endl;
	cout <<                                                      endl;
	cout << "          -app_pe_info : Given an instance file, append PE information. -ins, -pe_info will be used." << endl;
	cout << "               The PE reads in files specified in -pe_info will be used." << endl;
	cout <<                                                      endl;
	// Plain parameters shared by the jobs above.
	cout << "--------Parameters : -------------------------------------------------" << endl;
	cout <<                                                          endl;
	cout << "          -h : This help" << endl; 
	cout <<                                                          endl;
	cout << "          -o <file> : A file for output" << endl;
	cout <<                                                          endl;
	cout << "          -bound <file> : Boundary file. The format of the file is : " << endl;
 	cout << "                chromosome  strand  position  type"              << endl;
 	cout << "                The position is always the position of the first base of an exon or intron." << endl;
 	cout << "                type = 0 for intron -> exon, type = 1 for exon -> intron." << endl;
	cout <<                                                          endl;
	cout << "          -grange <file> : Gene range file. The format of the file is : " << endl;
 	cout << "                 name chromosome  strand  start_position end_position exon_start_positions, exon_end_positions " << endl;
 	cout << "                 Exon start/end positions are sepereted by comma." << endl;
	cout <<                                                          endl;
	cout << "          -tsspas <file> : TSS and PAS file. The format of the file is : " << endl;
	cout << "                    gene_name   TSSs   PASs " << endl;
 	cout << "                    TSSs or PASs are sepereted by comma. In each line, an isoform " << endl;
	cout << "                    starting from one element in TSSs must end with some element in " << endl;
	cout << "                    PASs. There is no constraint on different lines. " << endl;
	cout <<                                                          endl;
	cout << "          -r <file> : RefSeq" << endl; 
	cout <<                                                          endl;
	cout << "          -m <file> : A file contains the mapping information of short reads to the ref sequence" << endl;
	cout <<                                                          endl;
	cout << "          -mj <file> : A file contains the mapping information of short reads to the junctions" << endl;
	cout <<                                                          endl;
	cout << "          -l <a number> : The length of a read. Default 25"         << endl;
	cout <<                                                          endl;
	cout << "          -c <a number> : The cross strength to generate junction ref sequence. Default 1"         << endl;
	cout <<                                                          endl;
	cout << "          -s <T/F> : Whether are the operations strand specific or not? Default F"         << endl;
	cout <<                                                          endl;
	cout << "          -ins <file> : A file contains instances."         << endl;
	cout <<                                                          endl;
	// NOTE(review): the text below announces 6 fields but only [0]..[3] are
	// listed; confirm the real format against LoadPEInfo.
	cout << "          -pe_info <file> : A file of PE information. Each line of the file consists of 6 fields :" << endl;
	cout << "               [0] : The length of the PE reads." << endl;
	cout << "               [1] : The mean of the span" << endl;
	cout << "               [2] : The std of the span" << endl;
	cout << "               [3] : The file containing the mapping of PE reads to the ref sequences" << endl;
	cout <<                                                          endl;
	cout << "          -noise <a number> : The noise level in RPKM. Segments below this expression level (in RPKM) " << endl;
	cout << "               is treated as intron. Default 1." << endl;
	cout << endl;

	// Terminates the whole process, so code following a Help() call is never
	// reached. The exit status is 0 even when Help() is called after an
	// argument error.
	exit(0);
}

/*
 * Returns true when the command line option `opt` consumes the argument that
 * follows it as its value. DataProcessor::Main uses this to make sure argv is
 * never read past argc. The list must be kept in sync with the option chain
 * in Main.
 */
static bool
DataProcessorOptionTakesValue(const char* opt)
{
	static const char* const value_options[] = {
		"-g", "-r", "-m", "-mj", "-o", "-l", "-c", "-s", "-ins", "-pe_info",
		"-high_isoform", "-ext_junc_ref", "-ext_known_junc_ref", "-gen_instance",
		"-gen_pe_reads", "-noise", "-bound", "-grange", "-tsspas"
	};
	const unsigned option_cnt = sizeof(value_options) / sizeof(value_options[0]);

	for (unsigned k = 0; k < option_cnt; k++)
	{
		if (strcmp(opt, value_options[k]) == 0)
			return true;
	}
	return false;
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  Main
 *  Description:  Entry point of this sub-program. Parses the command line, opens
 *                the output stream (the file given by -o, or stdout) and runs the
 *                requested job. See Help() for the list of jobs and parameters.
 *        Param:  argc, argv : The command line.
 *                startArg   : Index of the first element of argv to be parsed.
 *       Return:  0 on success. 1 if the output file can not be opened, if the gene
 *                / exon information can not be loaded or if no PE information is
 *                available for -gen_pe_reads.
 *                Help() terminates the process, so a missing or wrong argument
 *                normally ends the program instead of returning.
 *--------------------------------------------------------------------------------------
 */
/*virtual*/
int 
DataProcessor::Main(int argc, char* argv[], int startArg)
{
	string output_file = "";
	string knownGene_file = "";
	string ins_file = "";
	string refseq_file = "";
	string map_file = "";
	string map_junc_file = "";
	string pe_info_file = "";
	string bound_file = "";
	string grange_file = "";
	string tsspas_file = "";

	mReadLen = 25;
	mCrossStrength = 1;
	mbStrandSpecific = false;
	int pe_read_cnt = 10000000;
	double noise_level = 1.0;
	int start_pos_of_first_nt = 0;

	bool b_ext_junc_ref = false;
	bool b_ext_known_junc_ref = false;
	bool b_gen_instance = false;
	bool b_group_isoform= false;
	bool b_ext_exon     = false;
	bool b_gen_pe_reads = false;
	bool b_app_pe_info  = false;
	bool b_use_provided_exons = false;
	bool b_high_isoform = false;
	bool b_check_start_end = true;

	for (int i = startArg; i < argc; i++)
	{
		// Every option below that takes a value reads argv[++i]. Make sure that
		// element exists before touching it.
		if (DataProcessorOptionTakesValue(argv[i]) && i + 1 >= argc)
		{
			cerr << "The value of parameter " << argv[i] << " is missing" << endl;
			Help();
			return 1;
		}

		if (strcmp(argv[i], "-h") == 0)
		{
			Help();
			return 0;
		}

		else if (strcmp(argv[i], "-g") == 0)
			knownGene_file = argv[++i];
		else if (strcmp(argv[i], "-r") == 0)
			refseq_file = argv[++i];
		else if (strcmp(argv[i], "-m") == 0)
			map_file = argv[++i];
		else if (strcmp(argv[i], "-mj") == 0)
			map_junc_file = argv[++i];
		else if (strcmp(argv[i], "-o") == 0)
			output_file = argv[++i];
		else if (strcmp(argv[i], "-l") == 0)
			mReadLen = atoi(argv[++i]);
		else if (strcmp(argv[i], "-c") == 0)
			mCrossStrength = atoi(argv[++i]);
		else if (strcmp(argv[i], "-s") == 0)
			mbStrandSpecific = (argv[++i][0] == 'T');
		else if (strcmp(argv[i], "-ins") == 0)
			ins_file = argv[++i];
		else if (strcmp(argv[i], "-pe_info") == 0)
			pe_info_file = argv[++i];
		else if (strcmp(argv[i], "-high_isoform") == 0)
		{
			b_high_isoform = true;
			b_check_start_end = (argv[++i][0] == 'T');
		}
		else if (strcmp(argv[i], "-ext_junc_ref") == 0)
		{
			b_ext_junc_ref = true;
			start_pos_of_first_nt = atoi(argv[++i]);
		}
		else if (strcmp(argv[i], "-ext_known_junc_ref") == 0)
		{
			b_ext_known_junc_ref = true;
			start_pos_of_first_nt = atoi(argv[++i]);
		}
		else if (strcmp(argv[i], "-gen_instance") == 0)
		{
			b_use_provided_exons = (argv[++i][0] == 'T');
			b_gen_instance = true;
		}
		else if (strcmp(argv[i], "-group_isoform") == 0)
			b_group_isoform= true;
		else if (strcmp(argv[i], "-ext_exon") == 0)
			b_ext_exon = true;
		else if (strcmp(argv[i], "-gen_pe_reads") == 0)
		{
			b_gen_pe_reads = true;
			pe_read_cnt = atoi(argv[++i]);
		}
		else if (strcmp(argv[i], "-noise") == 0)
			noise_level = atof(argv[++i]);
		else if (strcmp(argv[i], "-app_pe_info") == 0)
			b_app_pe_info = true;
		else if (strcmp(argv[i], "-bound") == 0)
			bound_file = argv[++i];
		else if (strcmp(argv[i], "-grange") == 0)
			grange_file = argv[++i];
		else if (strcmp(argv[i], "-tsspas") == 0)
			tsspas_file = argv[++i];
		else 
		{
			cerr << "Wrong parameter " << argv[i] << endl;
			Help();
		}
	}

	// At least one job must be requested.
	if (!b_ext_junc_ref && !b_ext_known_junc_ref && !b_gen_instance && 
		!b_group_isoform && !b_ext_exon && !b_gen_pe_reads && !b_app_pe_info &&
		!b_high_isoform)
	{
		ArgMissing("Job is missing");
		Help();
	}

	// Prepare output
	if (output_file != "")
	{
		mpOutput = new ofstream;

		((ofstream*)mpOutput)->open(output_file.data(), ios::out);
		if (!((ofstream*)mpOutput)->is_open())
		{
			cerr << "File " << output_file.data() << " can not be opened" << endl;
			// This is a failure: report a non-zero status like the other
			// error paths of this function do.
			return 1;
		}
	}
	else
		mpOutput = &cout;

	if (b_ext_junc_ref)
	{
		if (bound_file == "") {ArgMissing("-bound"); Help();}
		if (grange_file == "") { ArgMissing("-grange"); Help(); }
		if (tsspas_file == "") { ArgMissing("-tsspas"); Help(); }

		map_str2vec_gene genes;
		map_str2vec_exon exons;
		map_str2vec_int  gene_color;
		map_str2vec_int  exon_color;

		if (!LoadGeneAndExon(bound_file, grange_file, tsspas_file, genes, exons)) return 1;
		GroupGeneAndExon(genes, gene_color, exons, exon_color);
		ExtractJunctionRef(exons, exon_color, genes, gene_color, refseq_file, start_pos_of_first_nt);
	}
	// NOTE(review): this is a plain "if", not an "else if", so -ext_junc_ref can
	// be combined with one of the jobs below. Kept as it was; confirm that this
	// is intended.
	if (b_ext_known_junc_ref)
	{
		if (grange_file == "") { ArgMissing("-grange"); Help(); }

		ExtractJunctionRef(grange_file, refseq_file, start_pos_of_first_nt);
	}
	else if (b_gen_instance)
	{
		if (bound_file == "") {ArgMissing("-bound"); Help();}
		if (grange_file == "") { ArgMissing("-grange"); Help(); }
		if (tsspas_file == "") { ArgMissing("-tsspas"); Help(); }

		map_str2vec_gene genes;
		map_str2vec_exon exons;
		map_str2vec_int  gene_color;
		map_str2vec_int  exon_color;

		if (!LoadGeneAndExon(bound_file, grange_file, tsspas_file, genes, exons)) return 1;
		GroupGeneAndExon(genes, gene_color, exons, exon_color);

		// The read files are optional for this job.
		map<string, map_64_double> junc_counts;
		if (map_junc_file != "") LoadJunctionReads(map_junc_file, exons, junc_counts);

		int mapped_cnt = 0;
		if (map_file != "") LoadNonJunctionReads(map_file, exons, junc_counts, mapped_cnt);

		vector<Instance> all_instances;

		if (pe_info_file != "")
		{
			vector<PEInfo> pe_infos;
			vector<string> mapping_files;

			LoadPEInfo(pe_info_file, pe_infos, mapping_files);
			LoadPEReads(pe_infos, mapping_files, all_instances);
		}

		ExtractInstances(genes, gene_color, exons, exon_color, junc_counts, all_instances, b_use_provided_exons, noise_level);

		InstanceWriter writer(mpOutput);
		for (unsigned i = 0; i < all_instances.size(); i++)
			writer.OnInstance(all_instances[i]);
	}
	else if (b_high_isoform)
	{
		if (grange_file == "") { ArgMissing("-grange"); Help(); }

		map_str2vec_gene genes;
		map_str2vec_exon exons;
		map_str2vec_int  gene_color;
		map_str2vec_int  exon_color;

		if (!LoadGeneAndExon(grange_file, genes, exons)) return 1;
		GroupGeneAndExon(genes, gene_color, exons, exon_color);

		map<string, map_64_double> junc_counts;
		if (map_junc_file != "") LoadJunctionReads(map_junc_file, exons, junc_counts);

		int mapped_cnt = 0;
		if (map_file != "") LoadNonJunctionReads(map_file, exons, junc_counts, mapped_cnt);

		HighIsoforms(genes, gene_color, exons, exon_color, junc_counts, b_check_start_end);
	}
	else if (b_app_pe_info)
	{
		if (ins_file == "") {ArgMissing("-ins"); Help();}
		if (pe_info_file == "") {ArgMissing("-pe_info"); Help();}

		InstanceReader reader(ins_file);
		vector<Instance> all_instances;
		reader.ReadAllInstances(all_instances);

		vector<PEInfo> pe_infos;
		vector<string> mapping_files;

		LoadPEInfo(pe_info_file, pe_infos, mapping_files);
		LoadPEReads(pe_infos, mapping_files, all_instances);

		InstanceWriter writer(mpOutput);
		for (unsigned i = 0; i < all_instances.size(); i++)
			writer.OnInstance(all_instances[i]);
	}
	else if (b_group_isoform)
	{
		if (bound_file == "") {ArgMissing("-bound"); Help();}
		if (grange_file == "") { ArgMissing("-grange"); Help(); }
		if (tsspas_file == "") { ArgMissing("-tsspas"); Help(); }

		map_str2vec_gene genes;
		map_str2vec_exon exons;
		map_str2vec_int  gene_color;
		map_str2vec_int  exon_color;

		if (!LoadGeneAndExon(bound_file, grange_file, tsspas_file, genes, exons)) return 1;
		GroupGeneAndExon(genes, gene_color, exons, exon_color);

		// One output line per gene : name, group (color), index on its chromosome
		for_each_ele_in_group(iter, map_str2vec_gene, genes)
		{
			vector<Gene>& curr_genes = iter->second;
			vector<int>& curr_gene_colors = gene_color[iter->first];

			for (unsigned i = 0; i < curr_genes.size(); i++)
				(*mpOutput) << curr_genes[i].mName << "\t" << curr_gene_colors[i] << "\t" << i << endl;
		}
	}
	else if (b_gen_pe_reads)
	{
		if (ins_file == "") {ArgMissing("-ins"); Help();}
		if (pe_info_file == "") {ArgMissing("-pe_info"); Help();}

		vector<PEInfo> pe_infos;
		vector<string> mapping_files;
		// pe_infos[0] is used below, so at least one record must be loaded.
		if (!LoadPEInfo(pe_info_file, pe_infos, mapping_files) || pe_infos.empty())
		{
			cerr << "No PE information can be loaded from " << pe_info_file << endl;
			return 1;
		}
		pe_infos[0].mReadCnt = pe_read_cnt;

		RandomExpReadAssignerIM semi_rand_gen(mpOutput);
		semi_rand_gen.AddPEInfo(pe_infos[0]);

		InstanceReader reader(ins_file);
		reader.SetHandler(&semi_rand_gen);

		reader.Initialize();
		reader.Generate();
		reader.CleanUp();
	}

	// The stream is owned by this function only when it is a file.
	if (output_file != "")
	{
		((ofstream*)mpOutput)->close();
		delete mpOutput;
	}
	return 0;
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  ExtractExon
 *  Description:  Extract all the exons from the file. Overlapped exons are divided
 *                at every exon boundary (start or end) found on the same chromosome,
 *                so that the resulting exons do not partially overlap.
 *        Param:  from_file :  The format of the file is the format of knownGene table 
 *                             from UCSC genome browser. The tab separated columns
 *                             read here are : [1] chromosome, [2] strand, [7] exon
 *                             count, [8] exon starts, [9] exon ends (comma separated).
 *                             Lines with less than 10 columns are reported and skipped.
 *                exons[out]:  A map from the chromosome to Exon. This array is sorted.
 *                             If mbStrandSpecific is set, the key is the chromosome
 *                             followed by the strand.
 *       Return:  false if the file can not be opened, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractExon(string from_file, map_str2vec_exon& exons)
{
	fstream infile;
	infile.open(from_file.data(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file.data() << " can not be opened" << endl;
		return false;
	}

	// For each chromosome, all the exon boundaries (starts and ends)
	map_str2vec_int bars;

	// read the data and fill the vector
	int org_total = 0;
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		splitted = Utility::Split('\t', line);

		// Columns up to index 9 are read below
		if (splitted.size() < 10)
		{
			cerr << "DATA ERROR, line " << line_cnt << " : less than 10 fields, skipped" << endl;
			line_cnt++;
			continue;
		}

		string chr;
		// chr is the combination of chromosome and strand
		if (mbStrandSpecific)
			chr = splitted[1]+splitted[2];
		else
			chr = splitted[1];
		bool strand = (splitted[2] == "+");
		int exon_cnt = atoi(splitted[7].data());
		vector<string> starts = Utility::Split(',', splitted[8]);
		vector<string> ends = Utility::Split(',', splitted[9]);

		if (starts.size() != ends.size())
			cerr << "DATA ERROR, line " << line_cnt << " : Starts cnt != Ends cnt" << endl;
		if (exon_cnt < 0 || static_cast<size_t>(exon_cnt) != starts.size())
		{
			cerr << exon_cnt << "," << starts.size() -1 << endl;
			cerr << "DATA ERROR, line " << line_cnt << " : Exon cnt != Starts cnt" << endl;
		}

		// Never read more exons than both lists really provide, whatever the
		// exon count column claims.
		size_t usable_cnt = (exon_cnt > 0) ? static_cast<size_t>(exon_cnt) : 0;
		if (usable_cnt > starts.size()) usable_cnt = starts.size();
		if (usable_cnt > ends.size())   usable_cnt = ends.size();

		// operator[] creates the entries of this chromosome on first use
		vector<int>& chr_bars = bars[chr];
		vector<Exon>& chr_exons = exons[chr];

		for (size_t i = 0; i < usable_cnt; i++)
		{
			const int exon_start = atoi(starts[i].data());
			const int exon_end = atoi(ends[i].data());

			// Fill bars
			chr_bars.push_back(exon_start);
			chr_bars.push_back(exon_end);

			// Fill exons. The counters get the same initial value as the ones
			// of the sub-exons created by the partition below.
			Exon exon;
			exon.mStart = exon_start;
			exon.mEnd = exon_end;
			exon.mChr = chr;
			exon.mStrand = strand;
			exon.mBothCnt = 0;
			exon.mStartCnt = 0;
			exon.mEndCnt = 0;
			chr_exons.push_back(exon);
			org_total++;
		}

		line_cnt++;
	}

	// On each chromosome, partition the exons
	for_each_ele_in_group(iter, map_str2vec_int, bars)
	{
		vector<Exon>& curr_exons = exons[iter->first];
		vector<int>& curr_bars = iter->second;

		// Sort the bars and remove dups. Unlike a hand written compaction this
		// leaves an empty vector empty.
		sort(curr_bars.begin(), curr_bars.end());
		curr_bars.erase(unique(curr_bars.begin(), curr_bars.end()), curr_bars.end());

		const int bar_cnt = static_cast<int>(curr_bars.size());
		const int org_exon_cnt = static_cast<int>(curr_exons.size());
		vector<Exon> temp_exons;

		// Partition each exon on this chromosome
		for (int i = 0; i < org_exon_cnt; i++)
		{
			// binary search the start position of this exon in curr_bars
			int idx = Utility2Temp<int>::BinarySearch(curr_bars, curr_exons[i].mStart);
			if (idx < 0 || idx + 1 >= bar_cnt)
			{
				// No boundary follows the start (e.g. an exon of length 0 at the
				// last boundary), there is nothing to cut.
				cerr << "ERROR : no boundary follows the start of exon " << i 
					 << " on " << iter->first << endl;
				continue;
			}
			if (curr_bars[idx] != curr_exons[i].mStart)
				cerr << "ERROR : curr_bars[idx] != curr_exons[i].mStart" << endl;

			// The exon itself is shrunk to its first piece, every further piece
			// up to the original end becomes a new exon.
			int end = curr_exons[i].mEnd;
			curr_exons[i].mEnd = curr_bars[++idx];
			while (idx + 1 < bar_cnt && end > curr_bars[idx])
			{
				Exon exon;
				exon.mStart = curr_bars[idx];
				exon.mEnd = curr_bars[idx+1];
				exon.mChr = curr_exons[i].mChr;
				exon.mStrand = curr_exons[i].mStrand;
				exon.mBothCnt = 0;
				exon.mStartCnt = 0;
				exon.mEndCnt = 0;
				temp_exons.push_back(exon);
				idx++;
			}
		}

		curr_exons.insert(curr_exons.end(), temp_exons.begin(), temp_exons.end());

		// Sort the exons and remove dups. The compaction relies on operator!=
		// of Exon only and must not run on an empty vector, otherwise the
		// final resize would create an exon out of nothing.
		sort(curr_exons.begin(), curr_exons.end());
		if (!curr_exons.empty())
		{
			size_t last_kept = 0;
			for (size_t i = 1; i < curr_exons.size(); i++)
			{
				if (curr_exons[i] != curr_exons[last_kept])
					curr_exons[++last_kept] = curr_exons[i];  
			}
			curr_exons.resize(last_kept + 1);
		}
	}

	// Statistics : exons read, exons after partition, exons shorter than a read
	int total = 0;
	int cnt_short_read = 0;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		vector<Exon>& curr_exons = iter->second;
		total += curr_exons.size();
		for (size_t i = 0; i < curr_exons.size(); i++)
		{
			if (curr_exons[i].mEnd - curr_exons[i].mStart < mReadLen)
				cnt_short_read++;
		}
	}

	cout << "Ori : Part : Short = " << org_total << " : " << total << " : " << cnt_short_read << endl;
	return true;
}		/* -----  end of method DataProcessor::ExtractExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  LoadNonJunctionReads 
 * Description:  Specify the positions and length, this method extract the sequences
 *       Param:  start_pos_of_first_nt : The position of the first neocliotide on a chromosome.
 *      Return:
 *--------------------------------------------------------------------------------------
 */
	bool
DataProcessor::ExtractRef(string& refseq_file, map<string, map<int, string> >& pos2refseq, int cross_len, int start_pos_of_first_nt)
{
	// Scan the refseq, extract the required segments 
	fstream infile;
	infile.open(refseq_file.data(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << refseq_file.data() << " can not be opened" << endl;
		return false;
	}

	int nt_cnt = start_pos_of_first_nt;
	string line;
	char buf[cross_len];
	char buf_seq[cross_len+1];
	buf_seq[cross_len] = 0;
	int buf_pointer = 0;
	string curr_chr;
	map<int, string>* p_map_pos = 0;
	map<int, string>* p_map_neg = 0;
	set<string> exist_chr;
	while (getline(infile, line))
	{
		if (line[0] == '>')
		{
			if (nt_cnt != start_pos_of_first_nt)
				cout << "Length of chromosome " << curr_chr << " is " << nt_cnt << endl;
			nt_cnt = start_pos_of_first_nt;
			buf_pointer = 0;
			vector<char> delimits;
			delimits.push_back('>');
			delimits.push_back('|');
			vector<string> splitted = Utility::Split(delimits, line);
			curr_chr = splitted[1];

			p_map_pos = 0;
			p_map_neg = 0;
			if (mbStrandSpecific)
			{
				string chr = curr_chr + "+";
				if (pos2refseq.find(chr) != pos2refseq.end())
					p_map_pos = &pos2refseq[chr];
				chr = curr_chr + "-";
				if (pos2refseq.find(chr) != pos2refseq.end())
					p_map_neg = &pos2refseq[chr];
			}
			else
			{
				if (pos2refseq.find(curr_chr) != pos2refseq.end())
					p_map_pos = &pos2refseq[curr_chr];
			}
			exist_chr.insert(curr_chr);
			continue;
		}

		const char* cont = line.data();
		// fill the buffer
		for (int c = 0; c < line.length(); c++)
		{
			buf[buf_pointer] = cont[c];	

			// Note, +1 here
			int pos = nt_cnt-cross_len+1;  
			if (p_map_pos && p_map_pos->find(pos) != p_map_pos->end())
			{
				int idx = buf_pointer;
				for (int i = cross_len - 1; i >= 0; i--)
				{
					buf_seq[i] = buf[idx];
					idx = (idx + cross_len - 1) % cross_len;
				}
				(*p_map_pos)[pos] = buf_seq;
			}
			if (p_map_neg && p_map_neg->find(pos) != p_map_neg->end())
			{
				int idx = buf_pointer;
				for (int i = cross_len - 1; i >= 0; i--)
				{
					buf_seq[i] = buf[idx];
					idx = (idx + cross_len - 1) % cross_len;
				}
				(*p_map_neg)[pos] = buf_seq;
			}
			buf_pointer = (buf_pointer + cross_len + 1) % (cross_len);
			nt_cnt++;
		}
	}

	// For the tail
	for (int k = 1; k < cross_len; k++)
	{
		int pos = nt_cnt - k;

		if (p_map_pos && p_map_pos->find(pos) != p_map_pos->end())
		{
			int idx = (buf_pointer + cross_len - 1) % cross_len;
			for (int i = k-1; i >= 0; i--)
			{
				buf_seq[i] = buf[idx];
				idx = (idx + cross_len - 1) % cross_len;
			}
			buf_seq[k] = 0;
			(*p_map_pos)[pos] = buf_seq;
		}
		if (p_map_neg && p_map_neg->find(pos) != p_map_neg->end())
		{
			int idx = (buf_pointer + cross_len - 1) % cross_len;
			for (int i = k-1; i >= 0; i--)
			{
				buf_seq[i] = buf[idx];
				idx = (idx + cross_len - 1) % cross_len;
			}
			buf_seq[k] = 0;
			(*p_map_neg)[pos] = buf_seq;
		}
	}

	infile.close();

	return true;
}		/* -----  end of method DataProcessor::ExtractRef  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadNonJunctionReads
 *  Description:  Given the mapped reads and the gene isoform information, this function
 *                count the number of reads that starting from and ending at each exons.
 *                The format of the mapping is (tab separated):
 *                chromosome    strand   start 
 *                A read covers [start, start + mReadLen - 1]. Lines with less than
 *                3 fields are reported and counted as dropped.
 *        Param:  from_file : The mapping file.
 *                exons[in/out] : A map from the chromosome to the exons on it. The
 *                    counters mStartCnt, mEndCnt and mBothCnt are increased.
 *                junc_counts[in/out] : A map from the chromosome to the count of 
 *                    each pair of exons.
 *                mapped_cnt[out] : The number of reads that are not dropped.
 *       Return:  false if the file can not be opened, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadNonJunctionReads(string from_file, map_str2vec_exon& exons, 
									map<string, map_64_double>& junc_counts, int& mapped_cnt)
{
	fstream infile;
	infile.open(from_file.data(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file.data() << " can not be opened" << endl;
		return false;
	}

	int tot_cnt = 0;
	int skipped_cnt = 0;
	// chromosome -> number of reads on it without exon information
	map_str2int warning_cnt;

	cout << "Loading single-end short reads" << endl;
	// read the data and fill the vector
	vector<string> splitted;
	string line;
	while (getline(infile, line))
	{
		splitted = Utility::Split('\t', line);

		// Fields 0, 1 and 2 are read below
		if (splitted.size() < 3)
		{
			cerr << "WARNING : line " << tot_cnt+1 << " contains less than 3 fields and is dropped" << endl;
			skipped_cnt++;
			tot_cnt++;
			continue;
		}

		string chr;
		if (mbStrandSpecific)
			chr = splitted[0]+splitted[1];
		else
			chr = splitted[0];
		int64 start = atoi(splitted[2].data());
		int64 end = start + mReadLen - 1;

		// operator[] creates the entry of this chromosome on first use
		map_64_double& exon_pair = junc_counts[chr];

		bool b_drop = true;
		if (exons.find(chr) != exons.end()) 
		{
			vector<Exon>& av = exons[chr];

			// -1 means that no exon is found for the position
			int exon_idx1 = FindExon(av, start);
			if (-1 != exon_idx1 && start <= av[exon_idx1].mEnd)
			{
				av[exon_idx1].mStartCnt += 1;
				b_drop = false;
			}

			int exon_idx2 = FindExon(av, end);
			if (-1 != exon_idx2 && end <= av[exon_idx2].mEnd)
			{
				av[exon_idx2].mEndCnt += 1;
				b_drop = false;
			}

			if (-1 != exon_idx1 && exon_idx1 == exon_idx2 && end < av[exon_idx2].mEnd)
				av[exon_idx2].mBothCnt += 1;

			// Count the pairs of consecutive exons covered by the read. Without
			// an exon for the start there is no valid first exon to pair, so
			// the pair (-1, 0) must not be counted.
			// NOTE(review): the upper bound exon_idx2-1 leaves out the pair
			// (exon_idx2-1, exon_idx2). Kept as it was; confirm it is intended.
			if (-1 != exon_idx1)
			{
				for (int i = exon_idx1; i < exon_idx2-1; i++)
				{
					int64 pair = Utility::combine64(i, i+1);
					// A missing pair is created with the count 0
					exon_pair[pair] += 1;
				}
			}
		}
		else
		{
			// Report the first reads of each unknown chromosome only
			int& chr_warnings = warning_cnt[splitted[0]];
			chr_warnings++;
			if (chr_warnings < 10)
				cerr << "WARNING : line " << tot_cnt+1 << " contains a chromosome " << splitted[0]
					 << ", on which no exon information is available" << endl;
		}

		if (b_drop) skipped_cnt++;
		tot_cnt++;

		if (tot_cnt % 100000 == 0)
		{
			cout << "   "  << tot_cnt << " reads are scaned.";
			cout << "   "  << skipped_cnt << " reads are droped." << endl;
		}
	}
	cout << "Final : "<< endl;
	cout << "   "  << tot_cnt << " reads are scaned." << endl;
	cout << "   "  << tot_cnt - skipped_cnt << " reads are mapped." << endl;
	cout << "   "  << skipped_cnt << " reads are droped." << endl;
	for_each_ele_in_group(iter, map_str2int, warning_cnt)
	{
		cerr << iter->second << " reads on chromosome " << iter->first << " have no exon information" << endl;
	}
	infile.close();

	mapped_cnt = tot_cnt - skipped_cnt;
	return true;
}		/* -----  end of method DataProcessor::LoadNonJunctionReads  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadJunctionReads
 *  Description:  Load the reads that cross two exons. Each line of the file has
 *                at least 5 tab separated fields, read as :
 *                chromosome   strand   start1   start2   shift
 *                (see the figure in the body for start1, start2 and shift).
 *                Lines with less than 5 fields and reads on chromosomes without
 *                exon information are skipped.
 *        Param:  from_file : The file of the junction reads.
 *                exons[in/out] : A map from the chromosome to the exons on it. The
 *                    counters mStartCnt and mEndCnt are increased.
 *                junc_counts[o], A map from exon pairs to how many count falling onto 
 *                the pair of exons.
 *       Return:  false if the file can not be opened, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadJunctionReads(string from_file, map_str2vec_exon& exons, map<string, map_64_double>& junc_counts)
{
	fstream infile;
	infile.open(from_file.data(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file.data() << " can not be opened" << endl;
		return false;
	}

	// read the data and fill the vector
	vector<string> splitted;
	int tot_cnt = 0;
	int skipped_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		splitted = Utility::Split('\t', line);

		// Fields 0 to 4 are read below
		if (splitted.size() < 5)
		{
			tot_cnt++;
			skipped_cnt++;
			cerr << "WARNING : junction read " << tot_cnt << " contains less than 5 fields and is skipped" << endl;
			continue;
		}

		string chr;
		if (mbStrandSpecific)
			chr = splitted[0]+splitted[1];
		else
			chr = splitted[0];
		int start1 = atoi(splitted[2].data());
		int start2 = atoi(splitted[3].data());
		int cross_len = mReadLen - mCrossStrength; 
		int shift = atoi(splitted[4].data());

		assert(start2 > start1);

		// operator[] creates the entry of this chromosome on first use
		map_64_double& exon_pair = junc_counts[chr];

		tot_cnt++;

		if (tot_cnt % 100000 == 0)
		{
			cout << "   "  << tot_cnt << " junction reads are scaned.";
			cout << "   "  << skipped_cnt << " junction reads are skipped." << endl;
		}

		// Do not use exons[chr] here : it would silently add an empty list of
		// exons for a chromosome without exon information.
		map_str2vec_exon::iterator exon_iter = exons.find(chr);
		if (exon_iter == exons.end())
		{
			skipped_cnt++;	
			continue;
		}
		vector<Exon>& curr_exons = exon_iter->second;
		const int exon_cnt = static_cast<int>(curr_exons.size());

		// Find exons on which the start / end point of the read rely.
		// Index 0 is a valid exon, a negative index means "not found".
		const int read_end_pos = start2+(mReadLen-(cross_len-shift))-1;
		int exon_idx1 = FindExon(curr_exons, start1+shift);
		int exon_idx2 = FindExon(curr_exons, read_end_pos);
		if (exon_idx1 >= 0 && exon_idx1 < exon_cnt) curr_exons[exon_idx1].mStartCnt++;
		if (exon_idx2 >= 0 && exon_idx2 < exon_cnt) curr_exons[exon_idx2].mEndCnt++;

		// From here on, exon_idx1 / exon_idx2 are the two exons of the junction
		exon_idx1 = FindExon(curr_exons, start1+cross_len-1);
		exon_idx2 = FindExon(curr_exons, start2);
		// The read does not map to the same strand of the junction
		if (exon_idx1 < 0 || exon_idx1 >= exon_cnt || curr_exons[exon_idx1].mEnd != start1+cross_len ||
		    exon_idx2 < 0 || exon_idx2 >= exon_cnt || curr_exons[exon_idx2].mStart != start2)
		{
			skipped_cnt++;	
			continue;
		}


		/*
		 * * : exon
		 * + : the last element of a exon
		 * - : intron
		 * = : read
		 *
		 * exon                     1           2       3      4      5         6
		 *                    ************+--*********+-****+---**+---***+**************+
		 *                          1           2     4       5           6
		 * juncref(2,4)       ************+--*********+**+---***+**************+
		 *                    ^start1                  ^start2
		 *                                            ^start1+cross_len-1 (pointing to the last element of exon 2)
		 *                    |-----  cross_len  -----| 
		 *                                             |----  cross_len  ------| 
		 * read                     ====================================
		 *                          ^start1+shift                      ^start2+(mReadLen-(cross_len-shift))-1
		 *                    |- shift -| 
		 * In this case, (2,4), (5,6) are observed junctions
		 *               (1,2), (4,5) are not, because they are disconnected by introns
		 * The read can be divided into three parts:
		 * [start1+shift,start1+cross_len-1], [start1+cross_len-1, start2], [start2, start2+(mReadLen-(cross_len-shift))-1]
		 */

		vector<int64> involved_pairs;

		// The junction itself
		involved_pairs.push_back(Utility::combine64(exon_idx1, exon_idx2));

		// On the left of exon_idx1, gradually find exons that are adjacent
		int start_exon = exon_idx1; 
		while (start_exon > 0 && curr_exons[start_exon-1].mEnd == curr_exons[start_exon].mStart && curr_exons[start_exon].mStart > start1)
		{
			involved_pairs.push_back(Utility::combine64(start_exon-1, start_exon));
			start_exon--;
		}
		// On the right of exon_idx2, gradually find exons that are adjacent
		int end_exon  = exon_idx2; 
		while (end_exon + 1 < exon_cnt && curr_exons[end_exon].mEnd == curr_exons[end_exon+1].mStart && curr_exons[end_exon].mEnd < read_end_pos)
		{
			involved_pairs.push_back(Utility::combine64(end_exon, end_exon+1));
			end_exon++;
		}

		// A missing pair is created with the count 0
		for (size_t i = 0; i < involved_pairs.size(); i++)
			exon_pair[involved_pairs[i]] += 1;
	}

	cout << "Final : "<< endl;
	cout << "   "  << tot_cnt << " reads are scaned." << endl;
	cout << "   "  << tot_cnt - skipped_cnt << " reads are mapped." << endl;
	cout << "   "  << skipped_cnt << " reads are droped." << endl;

	infile.close();
	return true;
}		/* -----  end of method DataProcessor::LoadJunctionReads  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  AppendStrictPEInfo
 *  Description:  Read the PE descriptions listed in pe_info_file with LoadPEInfo and
 *                register each of them with semi_rand_gen. The read count of every 
 *                description is set to -1 before it is handed over.
 *        Param:  pe_info_file  : See -pe_info for the format of this file. 
 *                semi_rand_gen : Receives one AddPEInfo call per loaded description.
 *       Return:  false if LoadPEInfo fails, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::AppendStrictPEInfo(string pe_info_file, RandomExpReadAssignerIM& semi_rand_gen)
{
	vector<PEInfo> loaded_infos;
	// LoadPEInfo also reports the mapping file names; they are not needed here.
	vector<string> ignored_mapping_files;

	const bool b_loaded = LoadPEInfo(pe_info_file, loaded_infos, ignored_mapping_files);
	if (b_loaded)
	{
		for (vector<PEInfo>::iterator info = loaded_infos.begin(); info != loaded_infos.end(); ++info)
		{
			info->mReadCnt = -1;
			semi_rand_gen.AddPEInfo(*info);
		}
	}

	return b_loaded;
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadPEInfo
 *  Description:  Load PE info. Every usable line of the file appends one element to 
 *                pe_infos and one element to mapping_files.
 *        Param:  pe_info_file  : See -pe_info for the format of this file. Each line
 *                    holds at least four fields separated by a single space: 
 *                    read length, span mean, span std, mapping file.
 *                pe_infos[o]      :  mReadLen, mSpanMean and mSpanStd are filled from
 *                    the first three fields (parsed with atoi).
 *                mapping_files[o] :  The fourth field of every line.
 *       Return:  false if the file can not be opened, true otherwise. Empty lines are
 *                ignored and lines with fewer than four fields are skipped with a 
 *                warning.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadPEInfo(string pe_info_file, 
						  vector<PEInfo>& pe_infos,
						  vector<string>& mapping_files)
{
	fstream infile;
	// c_str() is guaranteed to be NUL terminated, data() is not before C++11
	infile.open(pe_info_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << pe_info_file << " can not be opened" << endl;
		return false;
	}

	// read the data and fill the vector
	vector<string> splitted;
	string line;
	int line_cnt = 0;
	while (getline(infile, line))
	{
		line_cnt++;
		if (line.empty()) continue;

		splitted = Utility::Split(' ', line);
		// Indexing splitted[0..3] below would be out of bounds on a short line
		if (splitted.size() < 4)
		{
			cerr << "WARNING : line " << line_cnt << " of " << pe_info_file 
				 << " has fewer than 4 fields. It is skipped." << endl;
			continue;
		}

		PEInfo pe_info;
		pe_info.mReadLen = atoi(splitted[0].c_str());
		pe_info.mSpanMean = atoi(splitted[1].c_str());
		pe_info.mSpanStd = atoi(splitted[2].c_str());

		pe_infos.push_back(pe_info);
		mapping_files.push_back(splitted[3]);
	}
	infile.close();
	return true;
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadPEReads
 *  Description:  Prepare every instance for paired-end counting and load the reads of
 *                all mapping files. The PE splice read count matrix of each instance 
 *                is reset to zeros, the exons of all instances are collected per 
 *                chromosome and sorted, and then the single-file LoadPEReads is 
 *                called once for every element of pe_infos.
 *        Param:  pe_infos      :  One PE description per mapping file.
 *                mapping_files :  mapping_files[i] is loaded with pe_infos[i].
 *                all_instances :  Instances whose PE information is filled.
 *       Return:  
 *         Note:  The exons are keyed by chromosome name; the strand ("+" or "-") is
 *                appended to the key when mbStrandSpecific is set.
 *--------------------------------------------------------------------------------------
 */
void
DataProcessor::LoadPEReads(vector<PEInfo>& pe_infos, vector<string> mapping_files, vector<Instance>& all_instances)
{
	map_str2vec_exon exons_by_chr;

	for (unsigned ins_idx = 0; ins_idx < all_instances.size(); ins_idx++)
	{
		Instance& inst = all_instances[ins_idx];

		// Square matrix of zeros whose dimension is the size of mSampleCnt
		inst.mSpliceReadCntPE.resize(inst.mSampleCnt.size());
		for (unsigned row = 0; row < inst.mSpliceReadCntPE.size(); row++)
			inst.mSpliceReadCntPE[row].assign(inst.mSampleCnt.size(), 0);

		// Collect the exons of this instance under their chromosome key.
		// operator[] creates the empty vector the first time a key is seen.
		vector<Exon>& inst_exons = inst.mExons;
		for (unsigned e = 0; e < inst_exons.size(); e++)
		{
			string key = inst_exons[e].mChr;
			if (mbStrandSpecific)
				key += (inst_exons[e].mStrand ? "+" : "-");

			exons_by_chr[key].push_back(inst_exons[e]);
		}
	}

	for (map_str2vec_exon::iterator chr_it = exons_by_chr.begin(); chr_it != exons_by_chr.end(); ++chr_it)
	{
		vector<Exon>& chr_exons = chr_it->second;
		sort(chr_exons.begin(), chr_exons.end());
	}

	// The return value of the single-file loader is not inspected here
	for (unsigned file_idx = 0; file_idx < pe_infos.size(); file_idx++)
		LoadPEReads(mapping_files[file_idx], exons_by_chr, pe_infos[file_idx], all_instances);
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadPEReads
 *  Description:  Load the paired-end reads of one mapping file. One new PEInfo is
 *                appended to every instance. A read is counted when all of its 
 *                segments start in exons that belong to one and the same instance and
 *                the read is not contained in a single exon.
 *        Param:  from_file     :  Tab separated file with at least 9 columns. Column 0
 *                    is the chromosome; column 1 is appended to the chromosome name 
 *                    when mbStrandSpecific is set; columns 2, 4, 6 and 8 are the start
 *                    positions of the segments 1 to 4. A value of -1 in column 4 or 8
 *                    means that segment 2 or 4 is absent.
 *                exons         :  The exons of every chromosome. They must be sorted 
 *                    according to their start positions (required by FindExon).
 *                pe_info_seed  :  The span information will be copied to every instance
 *                all_instances :  The last element of mPEInfo and mSpliceReadCntPE of
 *                    the instances are updated.
 *       Return:  false if from_file can not be opened, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadPEReads(string from_file, map<string, vector<Exon> >& exons, 
		PEInfo& pe_info_seed, vector<Instance>& all_instances)
{
	fstream infile;
	// c_str() is guaranteed to be NUL terminated, data() is not before C++11
	infile.open(from_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file << " can not be opened" << endl;
		return false;
	}

	cout << "Loading PE reads from " << from_file << endl;

	// For every exon, find the instance to which it belongs
	// Setup the mapping from exon ID to the index of exons on every instance
	map_str2vec_int exon_2_instance;
	map_str2vec_int exon_2_local_idx;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		exon_2_instance[iter->first].assign(iter->second.size(), -1);
		exon_2_local_idx[iter->first].assign(iter->second.size(), -1);
	}

	for (unsigned ins = 0; ins < all_instances.size(); ins++)
	{
		Instance& an_instance = all_instances[ins];

		// Every instance gets the slot of this file, even if none of its exons can
		// be located, because the slot is filled for all instances at the end.
		an_instance.mPEInfo.resize(an_instance.mPEInfo.size()+1);

		if (an_instance.mExons.empty())
			continue;

		// The overload that builds 'exons' from the instances appends the strand to
		// the key when mbStrandSpecific is set. Try the plain name first and fall
		// back to the name with the strand, so both keying conventions are found.
		string curr_chr = an_instance.mExons[0].mChr;
		if (mbStrandSpecific && exons.find(curr_chr) == exons.end())
			curr_chr += (an_instance.mExons[0].mStrand ? "+" : "-");

		map<string, vector<Exon> >::iterator chr_iter = exons.find(curr_chr);
		if (chr_iter == exons.end())
		{
			cerr << "WARNING : no exons are available on " << curr_chr 
				 << " for instance " << ins << endl;
			continue;
		}

		vector<Exon>& exons_on_curr_chr = chr_iter->second;
		vector<int>& curr_vec = exon_2_instance[curr_chr];
		vector<int>& curr_local_idx = exon_2_local_idx[curr_chr];

		for (unsigned i = 0; i < an_instance.mSetSizes.size() && i < an_instance.mExons.size(); i++)
		{
			int start = an_instance.mExons[i].mStart;
			int exon_idx = FindExon(exons_on_curr_chr, start);
			// FindExon returns -1 if the position precedes every exon
			if (exon_idx < 0) continue;
			curr_vec[exon_idx] = ins;
			curr_local_idx[exon_idx] = i;
		}
	}
	
	// read the data and fill the vector
	vector<string> splitted;
	int tot_cnt = 0;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;
		if (line_cnt % 100000 == 0)
			cout << "   "  << line_cnt << " PE reads are scaned." << endl;

		if (line.empty()) continue;

		splitted = Utility::Split('\t', line);
		// Columns 0..8 are indexed below
		if (splitted.size() < 9)
		{
			cerr << "WARNING : line " << line_cnt << ". Fewer than 9 fields." << endl;
			continue;
		}

		string chr;
		if (mbStrandSpecific)
			chr = splitted[0]+splitted[1];
		else
			chr = splitted[0];
		int start1 = atoi(splitted[2].c_str());
		int start2 = atoi(splitted[4].c_str());
		int start3 = atoi(splitted[6].c_str());
		int start4 = atoi(splitted[8].c_str());

		// Look the chromosome up without inserting an empty entry for unknown names
		map<string, vector<Exon> >::iterator read_chr_iter = exons.find(chr);
		if (read_chr_iter == exons.end())
		{
			cerr << "WARNING : line " << line_cnt << ". No exons are available on " << chr << endl;
			continue;
		}

		vector<Exon>& av = read_chr_iter->second;
		vector<int>& curr_vec = exon_2_instance[chr];

		int exon_idx1 = FindExon(av, start1);
		int exon_idx2 = -1;
		if (-1 != start2) 
			exon_idx2 = FindExon(av, start2);
		int exon_idx3 = FindExon(av, start3);
		int exon_idx4 = -1;
		if (-1 != start4) 
			exon_idx4 = FindExon(av, start4);

		// An index of -1 for segment 1 or 3 means that the position precedes every
		// exon; it must not be used to index curr_vec.
		if (exon_idx1 < 0 || -1 == curr_vec[exon_idx1])
		{
			cerr << "WARNING : line " << line_cnt << ". Exon 1 does not appear in any instance." << endl;
			continue;
		}
		if (exon_idx2 != -1 && -1 == curr_vec[exon_idx2])
		{
			cerr << "WARNING : line " << line_cnt << ". Exon 2 does not appear in any instance." << endl;
			continue;
		}
		if (exon_idx3 < 0 || -1 == curr_vec[exon_idx3])
		{
			cerr << "WARNING : line " << line_cnt << ". Exon 3 does not appear in any instance." << endl;
			continue;
		}
		if (exon_idx4 != -1 && -1 == curr_vec[exon_idx4])
		{
			cerr << "WARNING : line " << line_cnt << ". Exon 4 does not appear in any instance." << endl;
			continue;
		}
		
		// If these four exons do not belong to the same gene. skip
		if (curr_vec[exon_idx1] != curr_vec[exon_idx3] ||
			(exon_idx2 != -1 && curr_vec[exon_idx1] != curr_vec[exon_idx2]) ||
			(exon_idx4 != -1 && curr_vec[exon_idx3] != curr_vec[exon_idx4]))
		{
			cerr << "WARNING : line " << line_cnt << " . The exons do not belong to the same instance" << endl;
			// Absent segments are reported as -1 instead of indexing with -1
			cerr << curr_vec[exon_idx1] << "\t" <<
				    (exon_idx2 != -1 ? curr_vec[exon_idx2] : -1) << "\t" <<
					curr_vec[exon_idx3] << "\t" <<
					(exon_idx4 != -1 ? curr_vec[exon_idx4] : -1) << endl;
			continue;
		}

		// If the whole PE read is in the same exon
		if (exon_idx1 == exon_idx3 && -1 == exon_idx4) continue;

		int curr_ins_idx = curr_vec[exon_idx1];
		Instance& an_instance = all_instances[curr_ins_idx];

		PEInfo& pe_info = an_instance.mPEInfo.back();

		vector<int>& curr_local_idx = exon_2_local_idx[chr];

		vector<vector<double> >& splice_read_cnt_pe  = an_instance.mSpliceReadCntPE;
		if (-1 != exon_idx2) splice_read_cnt_pe[curr_local_idx[exon_idx1]][curr_local_idx[exon_idx2]] += 1;
		if (-1 != exon_idx4) splice_read_cnt_pe[curr_local_idx[exon_idx3]][curr_local_idx[exon_idx4]] += 1;

		// Mark every exon of the instance that is touched by this read
		vector<bool> part_comb;
		part_comb.assign(an_instance.mSetSizes.size(), false);
		part_comb[curr_local_idx[exon_idx1]] = true;
		if (-1 != exon_idx2) part_comb[curr_local_idx[exon_idx2]] = true;
		part_comb[curr_local_idx[exon_idx3]] = true;
		if (-1 != exon_idx4) part_comb[curr_local_idx[exon_idx4]] = true;

		// Count the combination; identical combinations share one entry
		bool b_exist = false;
		for (unsigned i = 0; i < pe_info.mPartComb.size(); i++)
		{
			if (part_comb == pe_info.mPartComb[i])
			{
				b_exist = true;
				pe_info.mPartCombDup[i]++;
				break;
			}
		}
		if (!b_exist)
		{
			pe_info.mPartComb.push_back(part_comb);
			pe_info.mPartCombDup.push_back(1);
		}

		tot_cnt++;
	}

	infile.close();

	// Fill the last PE information of every instance related to span and read cnt
	for (unsigned ins = 0; ins < all_instances.size(); ins++)
	{
		PEInfo& pe_info = all_instances[ins].mPEInfo.back();
		pe_info.mReadLen = pe_info_seed.mReadLen;
		pe_info.mSpanLow = pe_info_seed.mSpanLow;
		pe_info.mSpanHigh = pe_info_seed.mSpanHigh;
		pe_info.mSpanMean = pe_info_seed.mSpanMean;
		pe_info.mSpanStd = pe_info_seed.mSpanStd;
		pe_info.mReadCnt = tot_cnt;
	}

	return true;
}		/* -----  end of method DataProcessor::LoadPEReads  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  Decompose
 *  Description:  Given an interval, decompose it into exons. This interval is supposed
 *                to start from the start position of some exon and end at the end 
 *                position of some exon. If no exon starts at 'start', or no exon 
 *                from there on ends at 'end', an error is printed and the program 
 *                exits with status 1.
 *        Param:  exons         :  Must be sorted according to their start positions.
 *                start, end    :  The boundaries of the interval.
 *                exons_idx[o]  :  Stores the index of decomposed exons. The indexes 
 *                    are appended; existing elements are kept.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
void
DataProcessor::Decompose(vector<Exon>& exons, int start, int end, vector<int>& exons_idx)
{
	const int exon_cnt = static_cast<int>(exons.size());

	// Binary search for the exon with start position == start.
	// 'first' stays -1 if there is no such exon (including an empty exon list).
	int low = 0;
	int high = exon_cnt - 1;
	int first = -1;

	while (low <= high)
	{
		int mid = low + (high - low) / 2;

		if (exons[mid].mStart < start)
			low = mid + 1;
		else if (exons[mid].mStart > start)
			high = mid - 1;
		else
		{
			first = mid;
			break;
		}
	}

	if (first < 0)
	{
		cerr << __func__ << " ERROR : exons[mid].mStart != start" << endl;
		exit(1);
	}

	// Collect consecutive exons until one reaches 'end'. 'last' never leaves the
	// valid index range, so the check after the loop is always in bounds.
	int last = first;
	while (true)
	{
		exons_idx.push_back(last);
		if (exons[last].mEnd < end && last + 1 < exon_cnt)
			last++;
		else 
			break;
	}
	if (exons[last].mEnd != end)
	{
		cerr << __func__ << " ERROR : exons[mid].mEnd != end" << endl;
		exit(1);
	}
	return ;
}		/* -----  end of method DataProcessor::Decompose  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  FindExon
 *  Description:  Given a position, find the exon with the greatest start position 
 *                that is not greater than the given position. Only start positions 
 *                are compared; the end of the returned exon is not checked.
 *        Param:  exons  :  The exons to search.
 *                pos    :  The position to look for.
 *       Return:  The index of the exon, or -1 if exons is empty or pos is less than 
 *                the start position of every exon.
 *         Note:  Make sure that exons has been sorted according to their start position.
 *--------------------------------------------------------------------------------------
 */
int
DataProcessor::FindExon(vector<Exon>& exons, int64 pos)
{
	// binary search
	int low = 0;
	int high = static_cast<int>(exons.size()) - 1;

	while (low <= high)
	{
		int mid = low + (high - low) / 2;
		if (pos < exons[mid].mStart)
			high = mid - 1;
		else if (pos > exons[mid].mStart)
			low = mid + 1;
		else
			// Exact hit. A plain 'else' is required here: the former test assigned
			// instead of comparing, so an exon starting at 0 never left the loop.
			return mid;
	}

	// No exact hit: 'high' is the last exon starting before pos, or -1
	return high;
}		/* -----  end of method DataProcessor::FindExon  ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGenes
 *  Description:  Load genes from a file
 *        Param:  gene_file_name     :  The format of this file is (tab separated) :
 *   	              name chromosome  strand  start_position end_position exon_start_positions exon_end_positions
 *   	              The last two columns are comma separated lists and are optional;
 *   	              a gene without them is loaded without exons.
 *   	          genes[o]  :  The genes of every chromosome. The strand is appended 
 *   	              to the chromosome key when mbStrandSpecific is set. The genes of 
 *   	              each chromosome are sorted with std::sort before returning.
 *       Return:  false if the file can not be opened, true otherwise. Empty lines are
 *                ignored. Lines with fewer than 5 fields, a gene name that appeared
 *                before or differing numbers of exon starts and ends are skipped 
 *                with a message.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGenes(string gene_file_name, map_str2vec_gene& genes)
{
	fstream infile;
	// c_str() is guaranteed to be NUL terminated, data() is not before C++11
	infile.open(gene_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << gene_file_name << " can not be opened" << endl;
		return false;
	}
	cout << "Loading genes" << endl;

	vector<string> splitted;
	int line_cnt = 0;
	string line;
	set<string> gene_names;
	while (getline(infile, line))
	{
		line_cnt++;
		if (line.empty()) continue;

		splitted = Utility::Split('\t', line);
		// Columns 0..4 are indexed unconditionally below
		if (splitted.size() < 5)
		{
			cerr << "ERROR, line " << line_cnt << " : fewer than 5 fields" << endl;
			continue;
		}

		string name = splitted[0];
		string chr = splitted[1];

		if (gene_names.find(name) != gene_names.end())
		{
			cerr << "WARNING : gene " << name << " has appeared before. Line " << line_cnt << endl;
			continue;
		}
		gene_names.insert(name);

		if (mbStrandSpecific)
			chr += splitted[2];

		vector<string> starts;
		vector<string> ends;

		if (splitted.size() >= 7)
		{
			starts = Utility::Split(',', splitted[5]);
			ends = Utility::Split(',', splitted[6]);
		}

		if (ends.size() != starts.size())
		{
			cerr << "ERROR, line " << line_cnt << " : Exon starts cnt != ends cnt" << endl;
			continue;
		}

		Gene a_gene;
		a_gene.mName = name;
		a_gene.mChr = chr;
		a_gene.mStrand = (splitted[2] == "+");
		a_gene.mStart = atoi(splitted[3].c_str());
		a_gene.mEnd = atoi(splitted[4].c_str());

		vector<Exon>& exons = a_gene.mExons;
		exons.resize(starts.size());
		for (unsigned i = 0; i < starts.size(); i++)
		{
			exons[i].mStart = atoi(starts[i].c_str());
			exons[i].mEnd = atoi(ends[i].c_str());
		}

		// operator[] creates the gene list of a chromosome on first use
		genes[chr].push_back(a_gene);
	}

	infile.close();
	cout << "Loading genes done. " << gene_names.size() << " genes have been loaded." << endl;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
		sort(iter->second.begin(), iter->second.end());

	return true;
}		/* -----  end of method DataProcessor::LoadGenes ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadBoundary
 *  Description:  Load exon-intron boundaries from a file
 *        Param:  boundary_file_name :  The format of the file is (tab separated) :
 *                    chromosome  strand  position type
 *                    The position is always the position of the first base 
 *                    of an exon or intron. The type must be 0 or 1.
 *                boundaries[o]  :  The boundaries of every chromosome, one element per
 *                    distinct position. A position that is listed with both types
 *                    gets type 2. The strand is appended to the chromosome key when
 *                    mbStrandSpecific is set.
 *       Return:  false if the file can not be opened, true otherwise. Empty lines are
 *                ignored. Lines with fewer than 4 fields or with another type are 
 *                skipped with a warning.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadBoundary(string boundary_file_name, map_str2vec_boundary& boundaries)
{
	fstream infile;
	// c_str() is guaranteed to be NUL terminated, data() is not before C++11
	infile.open(boundary_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << boundary_file_name << " can not be opened" << endl;
		return false;
	}

	// chromosome -> (position -> type); the map removes duplicated positions
	map_str2map_int642int bars;

	cout << "Loading boundaries" << endl;
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;
		if (line.empty()) continue;

		splitted = Utility::Split('\t', line);
		// Columns 0..3 are indexed below
		if (splitted.size() < 4)
		{
			cerr << "WARNING : on line " << line_cnt << ", fewer than 4 fields are found" << endl;
			continue;
		}

		string chr = splitted[0];
		string strand = splitted[1];
		if (mbStrandSpecific)
			chr += strand;

		int64 position = atoi(splitted[2].c_str());
		int type = atoi(splitted[3].c_str());
		if (type != 0 && type != 1)
		{
			cerr << "WARNING : on line " << line_cnt << ", the type of boundary is not 0 or 1" << endl;
			continue;
		}

		// Fill bars. operator[] creates the map of a chromosome on first use.
		map<int64,int>& curr_bounds = bars[chr];

		map<int64,int>::iterator known = curr_bounds.find(position);
		if (known == curr_bounds.end())
			curr_bounds[position] = type;
		else if (known->second != type)
			known->second = 2;
	}

	infile.close();

	int bound_cnt = 0;
	for_each_ele_in_group(iter, map_str2map_int642int, bars)
	{
		bound_cnt += iter->second.size();

		// Any previous content of this chromosome is replaced
		vector<Boundary>& curr_bounds = boundaries[iter->first];
		curr_bounds.clear();
		curr_bounds.resize(iter->second.size());

		int i = 0;
		for_each_ele_in_group(iter2, map_int642int, iter->second)
		{
			Boundary& bound = curr_bounds[i++];
			bound.mPos = iter2->first;
			bound.mType = iter2->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Loading boundaries done. " << bound_cnt << " boundaries have been loaded." << endl;
		
	return true;
}		/* -----  end of method DataProcessor::LoadBoundary ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadTSSPAS
 *  Description:  Load TSSs and PASs from a file
 *        Param:  tss_pas_file_name :  The format of the file is (tab separated) :
 *                    gene_name   TSSs   PASs
 *                    TSSs or PASs are separated by comma. In each line, an isoform
 *                    starting from one element in TSSs must end with some element in
 *                    PASs. There is no constraint on different lines.
 *                genes  :  mTSSs and mPASs of every gene are replaced by the groups 
 *                    loaded for its name (empty if the file has none). Each usable 
 *                    line contributes one group to both.
 *       Return:  false if the file can not be opened, true otherwise. 
 *         Note:  A TSS is accepted if gene start <= TSS < gene end, a PAS is accepted
 *                if gene start < PAS <= gene end. Lines with fewer than 3 fields, an
 *                unknown gene, no accepted TSS or no accepted PAS are skipped with 
 *                a warning. Empty lines are ignored.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadTSSPAS(string tss_pas_file_name, map_str2vec_gene& genes)
{
	fstream infile;
	// c_str() is guaranteed to be NUL terminated, data() is not before C++11
	infile.open(tss_pas_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << tss_pas_file_name << " can not be opened" << endl;
		return false;
	}

	map<string, vector<vector<int64> > >  TSSs;
	map<string, vector<vector<int64> > >  PASs;

	// Boundaries of every known gene, looked up by name
	map_str2int gene_starts;
	map_str2int gene_ends;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		vector<Gene>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
		{
			gene_starts[av[i].mName] = av[i].mStart;
			gene_ends[av[i].mName] = av[i].mEnd;
		}
	}

	cout << "Loading TSSs and PASs" << endl;
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;
		if (line.empty()) continue;

		splitted = Utility::Split('\t', line);
		// Columns 0..2 are indexed below
		if (splitted.size() < 3)
		{
			cerr << "WARNING : fewer than 3 fields are found in line " << line_cnt << endl;
			continue;
		}

		string& gene_name = splitted[0];

		if (gene_starts.find(gene_name) == gene_starts.end())
		{
			cerr << "WARNING : " << gene_name << " in line " << line_cnt << " does not appear in gene file" << endl;
			continue;
		}

		const int gene_start = gene_starts[gene_name];
		const int gene_end = gene_ends[gene_name];

		vector<string> tss = Utility::Split(',', splitted[1]);
		vector<string> pas = Utility::Split(',', splitted[2]);

		vector<int64> a_group_tss;
		for (unsigned i = 0; i < tss.size(); i++)
		{
			int pos = atoi(tss[i].c_str());

			if (gene_start > pos || gene_end <= pos)
			{
				cerr << "WARNING : the position " << pos << " in line " << line_cnt 
					 << " is out of the gene boundary" << endl;
			}
			else
				a_group_tss.push_back(pos);
		}

		vector<int64> a_group_pas;
		for (unsigned i = 0; i < pas.size(); i++)
		{
			int pos = atoi(pas[i].c_str());

			if (gene_start >= pos || gene_end < pos)
			{
				cerr << "WARNING : the position " << pos << " in line " << line_cnt 
					 << " is out of the gene boundary" << endl;
			}
			else
				a_group_pas.push_back(pos);
		}

		if (a_group_tss.size() == 0)
		{
			cerr << "WARNING : no TSS is found in line " << line_cnt << endl;
			continue;
		}
		if (a_group_pas.size() == 0)
		{
			cerr << "WARNING : no PAS is found in line " << line_cnt << endl;
			continue;
		}

		// The groups are always added in pairs, so TSSs[name] and PASs[name] have 
		// the same number of groups. operator[] creates the lists on first use.
		TSSs[gene_name].push_back(a_group_tss);
		PASs[gene_name].push_back(a_group_pas);
	}

	infile.close();
	cout << "Loading TSSs and PASs done. " << endl;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		vector<Gene>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
		{
			string& name = av[i].mName;
			av[i].mTSSs = TSSs[name];
			av[i].mPASs = PASs[name];
		}
	}

	return true;
}		/* -----  end of method DataProcessor::LoadTSSPAS ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGeneAndExon
 *  Description:  Load the genes from a file and cut every chromosome into expressed 
 *                segments. The boundaries are the exon starts (type 0), the exon ends
 *                (type 1) and the TSSs (type 0) and PASs (type 1) stored in the genes.
 *                A position that occurs with both types gets type 2. Every pair of 
 *                neighbouring boundaries forms one segment.
 *        Param:  gene_file_name  :  See LoadGenes for the format.
 *                genes[o]        :  Filled by LoadGenes.
 *                exons[o]        :  The segments of every chromosome, ordered by 
 *                    position. A chromosome with fewer than two boundaries gets an
 *                    empty list.
 *       Return:  false if LoadGenes fails, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGeneAndExon(string gene_file_name, map_str2vec_gene& genes, map_str2vec_exon& exons)
{
	// Records a boundary of the given type. If the position is already known with 
	// a different type, it is marked as type 2.
	struct BoundaryMerger
	{
		static void Add(map_int642int& bar_set, int64 pos, int type)
		{
			map_int642int::iterator known = bar_set.find(pos);
			if (known == bar_set.end())
				bar_set[pos] = type;
			else if (known->second != type)
				known->second = 2;
		}
	};

	if (!LoadGenes(gene_file_name, genes)) return false;

	map_str2vec_boundary boundaries;

	// Extract all the boundaries including TSSs and PASs on each chromosome
	cout << "Merge boundaries with TSSs and PASs" << endl;
	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		const string& chr = iter->first;
		vector<Gene>& curr_genes = iter->second;

		map_int642int bar_set;

		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			Gene& a_gene = curr_genes[i];

			for (unsigned j = 0; j < a_gene.mExons.size(); j++)
			{
				BoundaryMerger::Add(bar_set, a_gene.mExons[j].mStart, 0);
				BoundaryMerger::Add(bar_set, a_gene.mExons[j].mEnd, 1);
			}
			for (unsigned j = 0; j < a_gene.mTSSs.size(); j++)
			{
				for (unsigned k = 0; k < a_gene.mTSSs[j].size(); k++)
					BoundaryMerger::Add(bar_set, a_gene.mTSSs[j][k], 0);

				// mPASs is indexed with the group index of mTSSs
				if (j >= a_gene.mPASs.size()) continue;
				for (unsigned k = 0; k < a_gene.mPASs[j].size(); k++)
					BoundaryMerger::Add(bar_set, a_gene.mPASs[j][k], 1);
			}
		}

		// Copy the merged boundaries; operator[] creates the list of the chromosome
		vector<Boundary>& curr_bounds = boundaries[chr];
		curr_bounds.resize(bar_set.size());
		int i = 0;
		for_each_ele_in_group(bar_iter, map_int642int, bar_set)
		{
			Boundary& bound = curr_bounds[i++];
			bound.mPos = bar_iter->first;
			bound.mType = bar_iter->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Build all expressed segments" << endl;
	int cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_boundary, boundaries)
	{
		string chr = iter->first;
		bool strand = true;
		// In strand specific mode the last character of the key is the strand
		if (mbStrandSpecific)
			strand = (!chr.empty() && chr[chr.length()-1] == '+');

		vector<Boundary>& curr_bounds = iter->second;

		vector<Exon>& curr_exons = exons[chr];
		curr_exons.clear();
		// size()-1 is unsigned and would wrap around for an empty boundary list
		if (curr_bounds.size() >= 2)
			curr_exons.resize(curr_bounds.size()-1);

		for (unsigned i = 1; i < curr_bounds.size(); i++)
		{
			// Note that every segment could be a part of an exon
			Exon& exon = curr_exons[i-1];
			exon.mStart = curr_bounds[i-1].mPos;
			exon.mEnd = curr_bounds[i].mPos;
			exon.mStartType = curr_bounds[i-1].mType;
			exon.mEndType = curr_bounds[i].mType;
			exon.mChr = chr;
			exon.mStrand = strand;
			exon.mBothCnt = 0;
			exon.mStartCnt = 0;
			exon.mEndCnt = 0;
		}
		cnt += curr_exons.size();
	}
	cout << cnt << " expressed segments have been build" << endl;

	return true;
}		/* -----  end of method DataProcessor::LoadGeneAndExon  ----- */



/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGeneAndExon
 *  Description:  Load the genes, their TSSs and PASs and the exon-intron boundaries
 *                from files and cut every chromosome into expressed segments. The 
 *                boundaries of the boundary file are merged with the TSSs (type 0) 
 *                and PASs (type 1) of the genes; a position that occurs with a 
 *                different type gets type 2. Every pair of neighbouring boundaries 
 *                forms one segment.
 *        Param:  boundary_file_name  :  See LoadBoundary for the format.
 *                gene_file_name      :  See LoadGenes for the format.
 *                tss_pas_file_name   :  See LoadTSSPAS for the format.
 *                genes[o]            :  Filled by LoadGenes and LoadTSSPAS.
 *                exons[o]            :  The segments of every chromosome, ordered by
 *                    position. A chromosome with fewer than two boundaries gets an
 *                    empty list.
 *       Return:  false if one of the files can not be loaded, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGeneAndExon(string boundary_file_name, string gene_file_name, 
							   string tss_pas_file_name, map_str2vec_gene& genes, map_str2vec_exon& exons)
{
	// Records a boundary of the given type. If the position is already known with 
	// a different type, it is marked as type 2.
	struct BoundaryMerger
	{
		static void Add(map_int642int& bar_set, int64 pos, int type)
		{
			map_int642int::iterator known = bar_set.find(pos);
			if (known == bar_set.end())
				bar_set[pos] = type;
			else if (known->second != type)
				known->second = 2;
		}
	};

	if (!LoadGenes(gene_file_name, genes)) return false;
	if (!LoadTSSPAS(tss_pas_file_name, genes)) return false;

	map_str2vec_boundary boundaries;
	if (!LoadBoundary(boundary_file_name, boundaries)) return false;

	// Extract all the boundaries including TSSs and PASs on each chromosome
	cout << "Merge boundaries with TSSs and PASs" << endl;
	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		const string& chr = iter->first;
		vector<Gene>& curr_genes = iter->second;

		// operator[] creates an empty list for a chromosome without loaded boundaries
		vector<Boundary>& curr_bounds = boundaries[chr];

		// Start from the boundaries of the boundary file
		map_int642int bar_set;
		for (unsigned i = 0; i < curr_bounds.size(); i++)
			bar_set[curr_bounds[i].mPos] = curr_bounds[i].mType;

		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			Gene& a_gene = curr_genes[i];

			for (unsigned j = 0; j < a_gene.mTSSs.size(); j++)
			{
				for (unsigned k = 0; k < a_gene.mTSSs[j].size(); k++)
					BoundaryMerger::Add(bar_set, a_gene.mTSSs[j][k], 0);

				// mPASs is indexed with the group index of mTSSs
				if (j >= a_gene.mPASs.size()) continue;
				for (unsigned k = 0; k < a_gene.mPASs[j].size(); k++)
					BoundaryMerger::Add(bar_set, a_gene.mPASs[j][k], 1);
			}
		}

		curr_bounds.resize(bar_set.size());
		int i = 0;
		for_each_ele_in_group(bar_iter, map_int642int, bar_set)
		{
			Boundary& bound = curr_bounds[i++];
			bound.mPos = bar_iter->first;
			bound.mType = bar_iter->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Build all expressed segments" << endl;
	int cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_boundary, boundaries)
	{
		string chr = iter->first;
		bool strand = true;
		// In strand specific mode the last character of the key is the strand
		if (mbStrandSpecific)
			strand = (!chr.empty() && chr[chr.length()-1] == '+');

		vector<Boundary>& curr_bounds = iter->second;

		vector<Exon>& curr_exons = exons[chr];
		curr_exons.clear();
		// size()-1 is unsigned and would wrap around for an empty boundary list
		if (curr_bounds.size() >= 2)
			curr_exons.resize(curr_bounds.size()-1);

		for (unsigned i = 1; i < curr_bounds.size(); i++)
		{
			// Note that every segment could be a part of an exon
			Exon& exon = curr_exons[i-1];
			exon.mStart = curr_bounds[i-1].mPos;
			exon.mEnd = curr_bounds[i].mPos;
			exon.mStartType = curr_bounds[i-1].mType;
			exon.mEndType = curr_bounds[i].mType;
			exon.mChr = chr;
			exon.mStrand = strand;
			exon.mBothCnt = 0;
			exon.mStartCnt = 0;
			exon.mEndCnt = 0;
		}
		cnt += curr_exons.size();
	}
	cout << cnt << " expressed segments have been build" << endl;

	return true;
}		/* -----  end of method DataProcessor::LoadGeneAndExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  GroupGeneAndExon
 *  Description:  Assign a color to every gene and every exon of each chromosome. A 
 *                gene starts with its own index as color. An exon gets the color of 
 *                the first gene that ends after the exon start, provided that the 
 *                exon does not start before that gene; all following genes that 
 *                start at or before the exon start take over the same color. Exons 
 *                outside of genes keep the color -1. Genes that were never reached 
 *                by the scan get the color -1 at the end. The result is reported on
 *                standard output.
 *        Param:  The genes and exons should be sorted according to their start positions
 *                on each chromosome.
 *                genes          :  Genes of every chromosome.
 *                gene_color[o]  :  One color per gene, same order as genes.
 *                exons          :  Exons of every chromosome. Chromosomes without 
 *                    genes are removed from this map.
 *                exon_color[o]  :  One color per exon, same order as exons.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
void
DataProcessor::GroupGeneAndExon(map_str2vec_gene& genes, map_str2vec_int& gene_color,
								map_str2vec_exon& exons, map_str2vec_int& exon_color)
{
	cout << "Grouping genes and exons" << endl;

	// Remove exons on chromosomes on which no genes exists.
	for (map_str2vec_exon::iterator exon_it = exons.begin(); exon_it != exons.end(); )
	{
		if (genes.find(exon_it->first) == genes.end())
			exons.erase(exon_it++);
		else
			++exon_it;
	}

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		string chr = iter->first;
		vector<Gene>& chr_genes = iter->second;
		vector<Exon>& chr_exons = exons[chr];

		// The exons should not overlap
		vector<int>& chr_gene_color = gene_color[chr];
		vector<int>& chr_exon_color = exon_color[chr];

		// Initially every gene is its own group and no exon belongs to a gene
		chr_gene_color.resize(chr_genes.size());
		for (unsigned g = 0; g < chr_gene_color.size(); g++)
			chr_gene_color[g] = g;
		chr_exon_color.assign(chr_exons.size(), -1);

		vector<bool> visited(chr_genes.size(), false);

		// 'cursor' is the first gene that ends after the start of the current exon
		unsigned cursor = 0;
		for (unsigned e = 0; e < chr_exons.size(); e++)
		{
			while (cursor < chr_genes.size() && chr_exons[e].mStart >= chr_genes[cursor].mEnd)
				cursor++;
			if (cursor == chr_genes.size())
				break;

			// NOTE(review): the gene is marked before the check whether the exon 
			// lies inside it, so a gene can count as visited because of an exon 
			// that starts before the gene. Kept as is -- confirm that it is intended.
			visited[cursor] = true;

			// Only an exon that does not start before the gene is colored
			if (chr_exons[e].mStart >= chr_genes[cursor].mStart)
			{
				chr_exon_color[e] = chr_gene_color[cursor];
				// Following genes that also cover the exon start join the group
				for (unsigned next = cursor + 1; 
					 next < chr_genes.size() && chr_exons[e].mStart >= chr_genes[next].mStart; 
					 next++)
				{
					visited[next] = true;
					chr_gene_color[next] = chr_gene_color[cursor];
				}
			}
		}

		cout << "On chromosome " << chr << endl;
		// Output information
		for (unsigned e = 0; e < chr_exon_color.size(); e++)
		{
			if (-1 != chr_exon_color[e]) continue;
			cout << "Expressed segement (" << chr_exons[e].mStart << "," 
				 << chr_exons[e].mEnd << ") is out of genes on chromosome " << chr << endl;
		}

		// Report every run of neighbouring genes with the same color. A run is only
		// reported if its last gene has been visited.
		unsigned group_begin = 0;
		while (group_begin < chr_genes.size())
		{
			unsigned group_end = group_begin;
			while (group_end+1 < chr_genes.size() && 
					chr_gene_color[group_end] == chr_gene_color[group_end+1])
				group_end++;

			if (visited[group_end])
			{
				cout << "Genes : ";
				for (unsigned g = group_begin; g <= group_end; g++)
					cout << chr_genes[g].mName << ",";
				cout << " are grouped together" << endl;
			}
			group_begin = group_end + 1;
		}

		for (unsigned g = 0; g < chr_genes.size(); g++)
		{
			if (visited[g]) continue;
			chr_gene_color[g] = -1;
			cout << "Gene " << chr_genes[g].mName << " has no expressed segments" << endl;
		}
	}
}		/* -----  end of method DataProcessor::GroupGeneAndExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  ExtractJunctionRef
 * Description:  Extract all junctions in given isoforms. Every pair of consecutive
 *               exons of a loaded transcript defines one junction (end of the first
 *               exon, start of the second one); identical junctions on a chromosome
 *               are reported once. For each junction two segments are registered in
 *               a position map, one at (end - cross_len) and one at the start of the
 *               second exon, and the map is handed to ExtractRef. Afterwards, for
 *               each junction a header line ">chr|first|second|cross_len|Junc" and
 *               the first cross_len characters of both segments are written to
 *               mpOutput.
 *       Param:  tran_file : The transcript file passed to MishMash::LoadTranscripts.
 *               refseq_file : The reference sequence file passed to ExtractRef.
 *               start_pos_of_first_nt : The position of the first nucleotide on a chromosome.
 *      Return:  false if MishMash::LoadTranscripts or ExtractRef fails, true otherwise.
 *--------------------------------------------------------------------------------------
 */
	bool
DataProcessor::ExtractJunctionRef(string tran_file, string& refseq_file, int start_pos_of_first_nt)
{
	// cross_len defines the strength of overlap. More specifically,
	// the junction requires at least 'cross_strength' bases
	// are aligned on both sides. Or, each exon contributes cross_len bases
	// to the junction ref.
	int cross_len = mReadLen - mCrossStrength;

	if (2 * cross_len < mReadLen)
		cerr << "WARNING : read is too short compared to the required cross strengh" << endl;

	cout << "Extracting known junction ref sequences from isoforms." << endl;

	map_str2vec_gene genes;
	if (!MishMash::LoadTranscripts(tran_file, genes, mbStrandSpecific))
		return false;

	map<string, map_int_str> pos2refseq;

	// Three parallel lists: the k-th elements of first_seg, second_seg and
	// chromosome together describe the k-th junction.
	list<int> first_seg, second_seg;
	list<string> chromosome;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		string curr_chr = iter->first;

		vector<Gene>& curr_genes = iter->second;

		// operator[] creates the (empty) position map of this chromosome, so
		// every chromosome with transcripts has an entry even without junctions.
		map_int_str& a_map = pos2refseq[curr_chr];

		// A set is used so that a junction shared by several isoforms is kept once.
		set<int64> junctions;
		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			vector<Exon>& exons = curr_genes[i].mExons;
			// "j + 1 < size()" rather than "j < size() - 1": size() is unsigned, so
			// the latter wraps around for a gene without exons.
			for (unsigned j = 0; j + 1 < exons.size(); j++)
			{
				int64 idx = Utility::combine64(exons[j].mEnd, exons[j+1].mStart);
				junctions.insert(idx);
			}
		}

		for (set<int64>::const_iterator junc = junctions.begin(); junc != junctions.end(); ++junc)
		{
			// Segment on the first exon: registered cross_len before the exon end
			int end = Utility::get_combined64_first(*junc);
			int pos = end - cross_len;
			first_seg.push_back(pos);
			a_map[pos] = "";

			// Segment on the second exon: registered at the exon start
			pos = Utility::get_combined64_second(*junc);
			second_seg.push_back(pos);
			a_map[pos] = "";

			chromosome.push_back(curr_chr);
		}
	}

	if (!ExtractRef(refseq_file, pos2refseq, cross_len, start_pos_of_first_nt)) return false;

	// Output the segments
	list<int>::iterator iter_first = first_seg.begin();
	list<int>::iterator iter_second = second_seg.begin();
	list<string>::iterator iter_chr = chromosome.begin();

	cout << "Total " << first_seg.size() << " junctions have been generated. Writing output" << endl;

	/* :WARNING:03/10/2009 03:01:27 PM:feeldead:  */
	// The junction refseq generated in this way is not exactly right
	// When the length of the exon is less than the length of the reads
	// the junction refseq may be impossible to be constructed.
	//
	// The three iterators advance together in the loop header, such that
	// skipping a junction with 'continue' keeps them in step.
	for (; iter_first != first_seg.end(); ++iter_first, ++iter_second, ++iter_chr)
	{
		if (pos2refseq.find(*iter_chr) == pos2refseq.end()) continue;

		map_int_str& a_map = pos2refseq[*iter_chr];

		const string& seq_first = a_map[*iter_first];
		const string& seq_second = a_map[*iter_second];

		// A segment shorter than cross_len can not be printed without reading
		// past the end of the string. Report and skip it, as the other
		// overload of ExtractJunctionRef does.
		if (static_cast<int>(seq_first.size()) < cross_len)
		{
			cerr << "ERROR : on chromosome " << *iter_chr << ", position " << *iter_first << " is out of range" << endl;
			continue;
		}
		if (static_cast<int>(seq_second.size()) < cross_len)
		{
			cerr << "ERROR : on chromosome " << *iter_chr << ", position " << *iter_second << " is out of range" << endl;
			continue;
		}

		// Output in the form which is compatible with LoadJunctionReads
		string chr = *iter_chr;
		if (mbStrandSpecific)
		{
			// Remove the strand mark at the end.
			chr = chr.substr(0, chr.length()-1);
		}
		(*mpOutput) << ">" << chr << "|" << *iter_first << "|" << *iter_second << "|" << cross_len << "|Junc" << endl;
		// Be careful about the start index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_first[i];
		// Be careful about the end index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_second[i];
		(*mpOutput) << endl;
	}

	cout << "Done" << endl;

	return true;
}		/* -----  end of method DataProcessor::ExtractJunctionRef  ----- */


/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  ExtractJunctionRef
 * Description:  Given grouped exons and the refseq, this method extract the junction
 *               refseq. Exons are processed in runs of equal color. Inside a run,
 *               every ordered pair of exons (first before second) that passes the
 *               adjacency/boundary-type filter and lies completely inside one gene
 *               of the same color forms a junction. For each junction a header line
 *               ">chr|first|second|cross_len|Junc" and the first cross_len characters
 *               of both extracted segments are written to mpOutput.
 *       Param:  exons : The exons on each chromosome.
 *               exon_color : The color of each exon, parallel to 'exons'. -1 marks
 *                            an exon that is skipped.
 *               genes : The genes on each chromosome.
 *               gene_color : The color of each gene, parallel to 'genes'. -1 marks
 *                            a gene that is skipped. Colors are used as indexes
 *                            into vectors with one slot per gene.
 *               refseq_file : The reference sequence file passed to ExtractRef.
 *               start_pos_of_first_nt : The position of the first nucleotide on a chromosome.
 *      Return:  false if ExtractRef fails, true otherwise.
 *--------------------------------------------------------------------------------------
 */
	bool
DataProcessor::ExtractJunctionRef(map_str2vec_exon& exons, map_str2vec_int& exon_color, 
								  map_str2vec_gene& genes, map_str2vec_int& gene_color, 
								  string& refseq_file, int start_pos_of_first_nt)
{
	// cross_len defines the strength of overlap. More specifically,
	// the junction requires at least 'cross_strength' bases
	// are aligned on both sides. Or, each exon contributes cross_len bases
	// to the junction ref.
	int cross_len = mReadLen - mCrossStrength;

	if (2 * cross_len < mReadLen)
		cerr << "WARNING : read is too short compared to the required cross strengh" << endl;

	cout << "Extracting junction ref sequences." << endl;

	map<string, map_int_str> pos2refseq;

	// Three parallel lists: the k-th elements of first_seg, second_seg and
	// chromosome together describe the k-th junction.
	list<int> first_seg, second_seg;
	list<string> chromosome;

	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		string curr_chr = iter->first;

		vector<Exon>& curr_exons = iter->second;
		vector<int>& curr_exon_color = exon_color[curr_chr];

		vector<Gene>& curr_genes = genes[curr_chr];
		vector<int>& curr_gene_color = gene_color[curr_chr];

		// For each color, find the start gene idx and end gene idx
		// A color without any gene keeps start = size and end = 0, which makes
		// the gene loop below empty.
		vector<int> start_gene_idx_indexed_by_color;
		vector<int> end_gene_idx_indexed_by_color;
		start_gene_idx_indexed_by_color.assign(curr_gene_color.size(), curr_gene_color.size());
		end_gene_idx_indexed_by_color.assign(curr_gene_color.size(), 0);
		for (unsigned gene_idx = 0; gene_idx < curr_gene_color.size(); gene_idx++)
		{
			const int color = curr_gene_color[gene_idx];
			if (-1 == color) continue;
			if (start_gene_idx_indexed_by_color[color] > static_cast<int>(gene_idx))
				start_gene_idx_indexed_by_color[color] = gene_idx;
			if (end_gene_idx_indexed_by_color[color] < static_cast<int>(gene_idx))
				end_gene_idx_indexed_by_color[color] = gene_idx;
		}

		for (unsigned exon_idx = 0; exon_idx < curr_exons.size(); exon_idx++)
		{
			// [old_idx, exon_idx] becomes the maximal run of exons sharing one color
			unsigned old_idx = exon_idx;
			while (exon_idx+1 < curr_exons.size() && 
					curr_exon_color[exon_idx] == curr_exon_color[exon_idx+1])
				exon_idx++;

			if (-1 == curr_exon_color[exon_idx]) continue;

			int start_gene_idx = start_gene_idx_indexed_by_color[curr_exon_color[exon_idx]];
			int end_gene_idx = end_gene_idx_indexed_by_color[curr_exon_color[exon_idx]];

			// operator[] creates the position map of this chromosome on first use
			map_int_str& a_map = pos2refseq[curr_chr];

			// Any two different exons in the involved exons may form a junction.
			for (unsigned first_idx = old_idx; first_idx <= exon_idx ; first_idx++)
			{
				for (unsigned second_idx = first_idx + 1; second_idx <= exon_idx ; second_idx++)
				{
					// Skip exons that touch each other, and pairs whose boundary
					// types rule out a junction.
					// NOTE(review): the meaning of mEndType == 0 and
					// mStartType == 1 is defined outside this method - confirm.
					if (curr_exons[first_idx].mEnd == curr_exons[second_idx].mStart ||
					    (0 == curr_exons[first_idx].mEndType || 1 == curr_exons[second_idx].mStartType))
						continue;

					// if curr two exons do not belong to the same gene, skip
					bool b_succ = false;
					for (int gene_idx = start_gene_idx; gene_idx <= end_gene_idx; gene_idx++)
					{
						if (curr_exons[first_idx].mStart >= curr_genes[gene_idx].mStart &&
						    curr_exons[first_idx].mEnd <= curr_genes[gene_idx].mEnd &&
						    curr_exons[second_idx].mStart >= curr_genes[gene_idx].mStart &&
						    curr_exons[second_idx].mEnd <= curr_genes[gene_idx].mEnd)
						{
							b_succ = true;
							break;
						}
					}

					if (!b_succ) continue;

					// Segment on the first exon: registered cross_len before the exon end
					int pos = curr_exons[first_idx].mEnd-cross_len;
					first_seg.push_back(pos);
					a_map[pos] = "";

					// Segment on the second exon: registered at the exon start
					pos = curr_exons[second_idx].mStart;
					second_seg.push_back(pos);
					a_map[pos] = "";

					chromosome.push_back(curr_chr);
				}
			}
		}
	}

	if (!ExtractRef(refseq_file, pos2refseq, cross_len, start_pos_of_first_nt)) return false;

	// Output the segments
	list<int>::iterator iter_first = first_seg.begin();
	list<int>::iterator iter_second = second_seg.begin();
	list<string>::iterator iter_chr = chromosome.begin();

	cout << "Total " << first_seg.size() << " junctions have been generated. Writing output" << endl;

	/* :WARNING:03/10/2009 03:01:27 PM:feeldead:  */
	// The junction refseq generated in this way is not exactly right
	// When the length of the exon is less than the length of the reads
	// the junction refseq may be impossible to be constructed.
	//
	// The three iterators advance together in the loop header. A 'continue'
	// inside a while loop would skip the increments and report the same
	// junction forever.
	for (; iter_first != first_seg.end(); ++iter_first, ++iter_second, ++iter_chr)
	{
		if (pos2refseq.find(*iter_chr) == pos2refseq.end()) continue;

		map_int_str& a_map = pos2refseq[*iter_chr];

		const string& seq_first = a_map[*iter_first];
		const string& seq_second = a_map[*iter_second];

		// A segment shorter than cross_len can not be printed completely
		if (static_cast<int>(seq_first.size()) < cross_len)
		{
			cerr << "ERROR : on chromosome " << *iter_chr << ", position " << *iter_first << " is out of range" << endl;
			continue;
		}
		if (static_cast<int>(seq_second.size()) < cross_len)
		{
			cerr << "ERROR : on chromosome " << *iter_chr << ", position " << *iter_second << " is out of range" << endl;
			continue;
		}

		// Output in the form which is compatible with LoadJunctionReads
		string chr = *iter_chr;
		if (mbStrandSpecific)
		{
			// Remove the strand mark at the end.
			chr = chr.substr(0, chr.length()-1);
		}
		(*mpOutput) << ">" << chr << "|" << *iter_first << "|" << *iter_second << "|" << cross_len << "|Junc" << endl;
		// Be careful about the start index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_first[i];
		// Be careful about the end index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_second[i];
		(*mpOutput) << endl;
	}

	cout << "Done" << endl;

	return true;
}		/* -----  end of method DataProcessor::ExtractJunctionRef  ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor 
 *       Method:  ExtractInstances
 *  Description:  Extract instances from given grouped genes and exons. Introns will be
 *                defined by given exons of each isoform or expression level. Please see
 *                Help() for more information.
 *
 *                One instance is built for every run of consecutive exons that share
 *                a color other than -1, together with the genes of the same color.
 *                Segments classified as introns are removed from the instance; a run
 *                whose segments are all introns produces no instance.
 *        Param:  genes, gene_color : The genes on each chromosome and their colors.
 *                exons, exon_color : The exons (segments) on each chromosome and their
 *                                    colors.
 *                junc_counts : For each chromosome, the read count of a junction, keyed
 *                              by Utility::combine64(idx1, idx2) of the two exon indexes
 *                              on that chromosome. An empty entry is created for a
 *                              processed chromosome that has none.
 *                all_instances : Output. Previous content is discarded.
 *                b_use_provided_exons : If true and the genes carry exons, introns are
 *                                       the segments not covered by any known isoform.
 *                                       Otherwise introns are decided by expression.
 *                noise_level : Segments whose expression (in RPKM) is not above this
 *                              value are introns when introns are decided by expression.
 *       Return:  Always true.
 *--------------------------------------------------------------------------------------
 */
	bool
DataProcessor::ExtractInstances(map_str2vec_gene& genes, map_str2vec_int& gene_color,
								map_str2vec_exon& exons, map_str2vec_int& exon_color,
								map<string, map_64_double>& junc_counts, vector<Instance>& all_instances,
								bool b_use_provided_exons, double noise_level)
{
	// Sum of mStartCnt over all the segments. It is the normalization factor of
	// the RPKM below and is stored as mReadCnt in every instance.
	double all_single_end_read_cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		vector<Exon>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
			all_single_end_read_cnt += av[i].mStartCnt;
	}

	// The result consists of the instances built here only (see the final
	// resize). Start from an empty vector such that every resize(prob_cnt)
	// below appends a fresh Instance instead of reusing the caller's one.
	all_instances.clear();

	// The number of instances built so far
	int prob_cnt = 0;

	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		string curr_chr = iter->first;
		vector<Exon>& curr_exons = iter->second;
		vector<int>& curr_exon_color = exon_color[curr_chr];

		if (genes.find(curr_chr) == genes.end()) continue;

		vector<Gene>& curr_genes = genes[curr_chr];
		vector<int>& curr_gene_color = gene_color[curr_chr];
		// operator[] creates an empty map if this chromosome has no junction counts
		map_64_double& junc_counts_on_curr_chr = junc_counts[curr_chr];


		if (curr_genes.size() == 0) continue;

		// Genes are consumed front to back while the exon runs are visited
		int gene_idx = 0;

		for (unsigned exon_idx = 0; exon_idx < curr_exons.size(); exon_idx++)
		{
			// [old_exon_idx, exon_idx] becomes the maximal run of exons sharing one color
			unsigned old_exon_idx = exon_idx;
			while (exon_idx+1 < curr_exons.size() && 
					curr_exon_color[exon_idx] == curr_exon_color[exon_idx+1])
				exon_idx++;
			
			if (-1 == curr_exon_color[exon_idx]) continue;

			prob_cnt++;
			all_instances.resize(prob_cnt);
			Instance& an_instance = all_instances[prob_cnt-1];

			an_instance.mInstanceCnt = prob_cnt;
			an_instance.mKnownCnt = 0;
			an_instance.mComments = "";
			an_instance.mNoiseLevel = 0;
			an_instance.mReadLen = mReadLen;
			an_instance.mReadCnt = (int)all_single_end_read_cnt;
			an_instance.mCrossStrength = mCrossStrength;
			

			vector<Exon>& exons_of_this_inst = an_instance.mExons;
			for (int i = old_exon_idx; i <= exon_idx; i++)
				exons_of_this_inst.push_back(curr_exons[i]);

			// Find the genes [old_gene_idx, gene_idx) having the color of this run.
			// Genes with a smaller color (including -1) are skipped. The scan is
			// bounded such that inconsistent colors trip the assert below instead
			// of reading past the end of curr_gene_color.
			while (gene_idx < curr_gene_color.size() &&
					curr_gene_color[gene_idx] < curr_exon_color[exon_idx]) gene_idx++;
			int old_gene_idx = gene_idx;
			while (gene_idx < curr_gene_color.size() && 
					curr_gene_color[gene_idx] == curr_exon_color[exon_idx]) gene_idx++;
			assert(old_gene_idx < gene_idx);

			vector<Gene>& genes_of_this_inst = an_instance.mGenes;
			genes_of_this_inst.resize(gene_idx - old_gene_idx);
			for (unsigned i = 0; i < genes_of_this_inst.size(); i++)
				genes_of_this_inst[i] = curr_genes[i+old_gene_idx];

			// Find start and end exons
			// Each TSS/PAS position is translated to the index, relative to this
			// instance, of the exon starting/ending exactly there. Positions
			// without such an exon are dropped.
			vector<vector<int> >& all_start_exons = an_instance.mStartExons;
			vector<vector<int> >& all_end_exons = an_instance.mEndExons;
			for (int i = old_gene_idx; i < gene_idx; i++)
			{
				vector<vector<int64> >& TSSs = curr_genes[i].mTSSs;
				vector<vector<int64> >& PASs = curr_genes[i].mPASs;

				// NOTE(review): PASs is indexed with the same j as TSSs, which
				// assumes that both have the same size - confirm.
				for (int j = 0; j < TSSs.size(); j++)
				{
					vector<int> start_exons;
					vector<int> end_exons;
					for (int k = 0; k < TSSs[j].size(); k++)
					{
						int tss = TSSs[j][k];
						// Find the corresponding exon
						int start = old_exon_idx;
						for (; start <= exon_idx; start++)
							if (tss == curr_exons[start].mStart) break;
						if (start <= exon_idx)
							start_exons.push_back(start - old_exon_idx);
					}
					for (int k = 0; k < PASs[j].size(); k++)
					{
						int pas = PASs[j][k];
						// Find the corresponding exon
						int end = old_exon_idx;
						for (; end <= exon_idx; end ++)
							if (pas == curr_exons[end].mEnd) break;
						if (end <= exon_idx)
							end_exons.push_back(end - old_exon_idx);
					}

					// Keep a (start exons, end exons) pair only if both sides are
					// not empty and the same pair has not been stored yet
					if (start_exons.size() > 0 && end_exons.size() > 0)
					{
						bool b_exist = false;
						for (int k = 0; k < all_start_exons.size(); k++)
							if (start_exons == all_start_exons[k] && end_exons == all_end_exons[k])
							{
								b_exist = true;
								break;
							}
						if (!b_exist)
						{
							all_start_exons.push_back(start_exons);
							all_end_exons.push_back(end_exons);
						}
					}
				}
			}

			// Construct known isoforms
			vector<vector<bool> >& isoforms = an_instance.mIsoforms;

			// Whether exons are provided is decided by the first gene of the group
			bool b_exons_provided = false;
			if (curr_genes[old_gene_idx].mExons.size() > 0)
				b_exons_provided = true;

			if (b_exons_provided)
			{
				// isoforms[i][j] tells whether segment j lies completely inside
				// some exon of the i-th gene of this instance
				isoforms.resize(gene_idx - old_gene_idx);
				for (unsigned i = 0; i < isoforms.size(); i++)
				{
					Gene& a_gene = curr_genes[i+old_gene_idx];
					vector<bool> an_iso;
					an_iso.assign(exons_of_this_inst.size(), false);
					for (unsigned j = 0; j < an_iso.size(); j++)
					{
						bool b_exist = false;
						for (unsigned k = 0; k < a_gene.mExons.size(); k++)
							if (a_gene.mExons[k].mStart <= exons_of_this_inst[j].mStart &&
								a_gene.mExons[k].mEnd >= exons_of_this_inst[j].mEnd)
							{
								b_exist = true;
								break;
							}
						an_iso[j] = b_exist;
					}
					isoforms[i] = an_iso;
				}
				an_instance.mKnownCnt = isoforms.size();
				an_instance.mIsoExp.resize(isoforms.size(), 0);
			}

			// Remove introns. If known isoforms are provided, introns are defined as segments that
			// have not been covered by known isoforms. If known isoforms are not provided, introns
			// are segments that have low expression levels. No start/end segments could be introns
			//
			// Calculate which segments are introns
			vector<bool> b_intron;
			b_intron.assign(exons_of_this_inst.size(), true);
			for (unsigned i = 0; i < all_start_exons.size(); i++)
			{
				for (unsigned j = 0; j < all_start_exons[i].size(); j++)
					b_intron[all_start_exons[i][j]] = false;
				for (unsigned j = 0; j < all_end_exons[i].size(); j++)
					b_intron[all_end_exons[i][j]] = false;
			}

			if (b_exons_provided && b_use_provided_exons)
			{
				for (unsigned i = 0; i < isoforms.size(); i++)
					for (unsigned j = 0; j < isoforms[i].size(); j++)
						if (isoforms[i][j]) b_intron[j] = false;
			}
			else
			{
				for (unsigned i = 0; i < exons_of_this_inst.size(); i++)
				{
					Exon& exon = exons_of_this_inst[i];
					// The read count of a segment is the largest of mStartCnt,
					// mEndCnt and the sum of the counts of the junctions from
					// this segment to the later segments of the instance.
					double read_cnt = exon.mStartCnt;
					if (read_cnt < exon.mEndCnt) read_cnt = exon.mEndCnt;

					double curr_junc_cnt = 0;
					for (unsigned j = i+1; j < exons_of_this_inst.size(); j++)
					{
						int exon1 = old_exon_idx + i;
						int exon2 = old_exon_idx + j;
						int64 idx = Utility::combine64(exon1, exon2);
						if (junc_counts_on_curr_chr.find(idx) != junc_counts_on_curr_chr.end())
							curr_junc_cnt += junc_counts_on_curr_chr[idx];
					}

					if (read_cnt < curr_junc_cnt) read_cnt = curr_junc_cnt;

					int length = exon.mEnd - exon.mStart;

					// In RPKM
					double exp = read_cnt * 1000000.0 / all_single_end_read_cnt * 1000.0 / length;

					if (exp > noise_level) b_intron[i] = false;
				}
			}

			// Remove introns. 
			// new2old_idx[n] is the old index of the n-th kept segment.
			// old2new_idx[o] is the new index of old segment o, or -1 if removed.
			vector<int> new2old_idx;
			vector<int> old2new_idx;
			new2old_idx.resize(b_intron.size());
			old2new_idx.assign(b_intron.size(), -1);
			int new_size = 0;
			for (unsigned i = 0; i < b_intron.size(); i++)
			{
				if (!b_intron[i])
				{
					old2new_idx[i] = new_size;
					new2old_idx[new_size++] = i;
				}
				else
				{
					cout << "Intron :" << exons_of_this_inst[i].mStart << "," 
						 << exons_of_this_inst[i].mEnd << " is removed" << endl;
				}
			}
			new2old_idx.resize(new_size);

			//assert(new_size > 0);
			// When no TSS/PAS information is provided, it is possible that all the 
			// segments on a gene are introns
			if (new_size == 0)
			{
				// Drop the half-built instance. Only decreasing prob_cnt would
				// leave it in the vector and the next instance would append its
				// exons and start/end exons to the stale content.
				// an_instance must not be used after this resize.
				prob_cnt--;
				all_instances.resize(prob_cnt);
				continue;
			}

			if (new_size != b_intron.size())
			{
				// Compact in place. new2old_idx[i] >= i, so no element is
				// overwritten before it has been moved.
				for (unsigned i = 0; i < new_size; i++)
					exons_of_this_inst[i] = exons_of_this_inst[new2old_idx[i]];
				exons_of_this_inst.resize(new_size);
				// Start/end segments are never introns, so old2new_idx is not -1 here
				for (unsigned i = 0; i < all_start_exons.size(); i++)
				{
					for (unsigned j = 0; j < all_start_exons[i].size(); j++)
						all_start_exons[i][j] = old2new_idx[all_start_exons[i][j]];
					for (unsigned j = 0; j < all_end_exons[i].size(); j++)
						all_end_exons[i][j] = old2new_idx[all_end_exons[i][j]];
				}
				for (unsigned i = 0; i < isoforms.size(); i++)
				{
					vector<bool>& an_iso = isoforms[i];

					// Check whether the intron removing is consistent with known isoforms
					bool b_consistent = true;
					for (unsigned j = 0; j < an_iso.size(); j++)
						if (an_iso[j] && b_intron[j])
						{
							cerr << "Intron removing is inconsistent with known isoform " << genes_of_this_inst[i].mName << endl;
							b_consistent = false;
							break;
						}

					// Set the isoform as empty such that no prediction will matched to it.
					if (!b_consistent)  
						an_iso.assign(new_size, false);
					else
					{
						for (unsigned j = 0; j < new_size; j++)
							an_iso[j] = an_iso[new2old_idx[j]];
						an_iso.resize(new_size);
					}
				}
			}

			vector<int>& set_sizes = an_instance.mSetSizes;
			vector<vector<double> >& splice_read_cnt = an_instance.mSpliceReadCnt;
			vector<double>& sample_cnt = an_instance.mSampleCnt;

			// Length and mBothCnt of each kept segment
			set_sizes.resize(new_size);
			sample_cnt.resize(set_sizes.size());
			for (unsigned i = 0; i < exons_of_this_inst.size(); i++)
			{
				set_sizes[i] = exons_of_this_inst[i].mEnd - exons_of_this_inst[i].mStart;
				sample_cnt[i] = exons_of_this_inst[i].mBothCnt;
			}

			splice_read_cnt.resize(set_sizes.size());
			for (int i = 0; i < splice_read_cnt.size(); i++)
				splice_read_cnt[i].assign(set_sizes.size(), 0);

			// extract the read counts that on the junctions (splice site)
			// Only the upper triangle (i < j) is filled. The junction key uses the
			// exon indexes on the chromosome, hence the translation by new2old_idx.
			if (junc_counts.find(curr_chr) != junc_counts.end())
			{
				for (int i = 0; i < new_size; i++)
				{
					for (int j = i+1; j < new_size; j++)
					{
						int exon1 = old_exon_idx + new2old_idx[i];
						int exon2 = old_exon_idx + new2old_idx[j];
						int64 idx = Utility::combine64(exon1, exon2);
						if (junc_counts_on_curr_chr.find(idx) != junc_counts_on_curr_chr.end())
							splice_read_cnt[i][j] = junc_counts_on_curr_chr[idx];
					}
				}
			}
		}
	}

	all_instances.resize(prob_cnt);

	return true;
}		/* -----  end of method DataProcessor::ExtractInstances  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor 
 *       Method:  HighIsoforms
 *  Description:  Given a set of isoforms and mapping information, find out highly expressed
 *                isoforms.
 *
 *                Exons are processed in runs of equal color together with the genes
 *                (isoforms) of the same color. An isoform is reported if every pair
 *                of consecutive covered segments that do not touch each other has a
 *                positive junction read count. For a reported isoform, the smallest
 *                expression (in RPKM) over its junctions and its checked exons is
 *                written to mpOutput, followed by a tab and the output of
 *                Gene::Write. The value is 100000 if nothing smaller is found.
 *        Param:  genes, gene_color : The isoforms on each chromosome and their colors.
 *                exons, exon_color : The exons (segments) on each chromosome and their
 *                                    colors. Color -1 marks a segment that is skipped.
 *                junc_counts : For each chromosome, the read count of a junction, keyed
 *                              by Utility::combine64(idx1, idx2) of the two exon indexes
 *                              on that chromosome.
 *                b_check_start_end : If false, the first and the last exon of an
 *                                    isoform are not used for the expression level.
 *       Return:  Always true.
 *--------------------------------------------------------------------------------------
 */
	bool
DataProcessor::HighIsoforms(map_str2vec_gene& genes, map_str2vec_int& gene_color,
							map_str2vec_exon& exons, map_str2vec_int& exon_color, 
							map<string, map_64_double>& junc_counts, bool b_check_start_end)
{
	// Sum of mStartCnt over all the segments, the normalization factor of the RPKM
	double all_single_end_read_cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		vector<Exon>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
			all_single_end_read_cnt += av[i].mStartCnt;
	}

	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		string curr_chr = iter->first;
		vector<Exon>& curr_exons = iter->second;
		vector<int>& curr_exon_color = exon_color[curr_chr];

		if (genes.find(curr_chr) == genes.end()) continue;

		vector<Gene>& curr_genes = genes[curr_chr];
		vector<int>& curr_gene_color = gene_color[curr_chr];

		if (curr_genes.size() == 0) continue;

		// Genes are consumed front to back while the exon runs are visited
		int gene_idx = 0;

		for (unsigned exon_idx = 0; exon_idx < curr_exons.size(); exon_idx++)
		{
			// [old_exon_idx, exon_idx] becomes the maximal run of exons sharing one color
			unsigned old_exon_idx = exon_idx;
			while (exon_idx+1 < curr_exons.size() && 
					curr_exon_color[exon_idx] == curr_exon_color[exon_idx+1])
				exon_idx++;
			
			if (-1 == curr_exon_color[exon_idx]) continue;

			vector<Exon> exons_of_this_inst(curr_exons.begin() + old_exon_idx,
											curr_exons.begin() + exon_idx + 1);

			// Find the genes [old_gene_idx, gene_idx) having the color of this run.
			// Genes with a smaller color (including -1) are skipped. The scan is
			// bounded such that inconsistent colors trip the assert below instead
			// of reading past the end of curr_gene_color.
			while (gene_idx < curr_gene_color.size() &&
					curr_gene_color[gene_idx] < curr_exon_color[exon_idx]) gene_idx++;
			int old_gene_idx = gene_idx;
			while (gene_idx < curr_gene_color.size() && 
					curr_gene_color[gene_idx] == curr_exon_color[exon_idx]) gene_idx++;
			assert(old_gene_idx < gene_idx);

			vector<vector<double> > splice_read_cnt;

			int exon_cnt = exons_of_this_inst.size();

			splice_read_cnt.resize(exon_cnt);
			for (int i = 0; i < splice_read_cnt.size(); i++)
				splice_read_cnt[i].assign(exon_cnt, 0);

			// extract the read counts that on the junctions (splice site)
			// Only the upper triangle (i < j) is filled
			if (junc_counts.find(curr_chr) != junc_counts.end())
			{
				map_64_double& junc_counts_on_curr_chr = junc_counts[curr_chr];

				for (int i = 0; i < exon_cnt; i++)
				{
					for (int j = i+1; j < exon_cnt; j++)
					{
						int exon1 = old_exon_idx + i;
						int exon2 = old_exon_idx + j;
						int64 idx = Utility::combine64(exon1, exon2);
						if (junc_counts_on_curr_chr.find(idx) != junc_counts_on_curr_chr.end())
							splice_read_cnt[i][j] = junc_counts_on_curr_chr[idx];
					}
				}
			}

			// Make sure that every exon in exons_of_this_inst is a sub-interval
			// of or does not overlap with this isoform.
			int iso_cnt = gene_idx - old_gene_idx;
			for (unsigned i = 0; i < iso_cnt; i++)
			{
				vector<Exon>& orig_exons = curr_genes[i+old_gene_idx].mExons;

				bool b_high = true;
				// The smallest expression seen so far for this isoform
				double min_exp = 100000;

				// an_iso[j] tells whether segment j lies completely inside some
				// exon of this isoform
				vector<bool> an_iso;
				an_iso.assign(exon_cnt, false);
				for (unsigned j = 0; j < an_iso.size(); j++)
				{
					bool b_exist = false;
					for (unsigned k = 0; k < orig_exons.size(); k++)
						if (orig_exons[k].mStart <= exons_of_this_inst[j].mStart &&
							orig_exons[k].mEnd >= exons_of_this_inst[j].mEnd)
						{
							b_exist = true;
							break;
						}
					an_iso[j] = b_exist;
				}

				// Walk through the covered segments. 'start' is the previous covered
				// segment, -1 before the first one.
				int start = -1;
				for (unsigned j = 0; j < an_iso.size(); j++)
				{
					if (an_iso[j])
					{
						if (-1 != start)
						{
							// Check junction
							// Two covered segments that do not touch each other
							// must be connected by junction reads
							if (exons_of_this_inst[start].mEnd != exons_of_this_inst[j].mStart)
							{
								if (splice_read_cnt[start][j] <= 0)
								{
									b_high = false;
									break;
								}
								else
								{
									double exp = splice_read_cnt[start][j] * 1000000.0 / all_single_end_read_cnt * 
												 1000.0 / (mReadLen - mCrossStrength + 1);
									if (min_exp > exp) min_exp = exp;
								}
							}
						}
						start = j;
					}
				}

				if (b_high)
				{
					int start_idx = 0;
					int end_idx = orig_exons.size();
					if (!b_check_start_end)
					{
						// Does not check the first and the last exon
						start_idx++;
						end_idx--;
					}
					for (int j = start_idx; j < end_idx; j++)
					{
						Exon& an_exon = orig_exons[j];

						// Check expression level
						// Sum up the segments lying completely inside this exon.
						// NOTE(review): (mCrossStrength - 1) is subtracted once for
						// every contained segment, not once for the exon - confirm
						// that this is intended.
						int read_cnt = 0;
						int length = 0;
						for (unsigned k = 0; k < exons_of_this_inst.size(); k++)
							if (an_exon.mStart <= exons_of_this_inst[k].mStart &&
								an_exon.mEnd >= exons_of_this_inst[k].mEnd)
							{
								read_cnt += exons_of_this_inst[k].mStartCnt;
								length += exons_of_this_inst[k].mEnd - exons_of_this_inst[k].mStart;
								length -= mCrossStrength - 1;
							}

						// An exon without effective length has no expression level.
						// Length 0 is skipped too in order to avoid a division by
						// zero, whose result could never lower min_exp anyway.
						if (length <= 0) continue;

						double exp = read_cnt * 1000000.0 / all_single_end_read_cnt * 1000.0 / length;

						if (min_exp > exp)
							min_exp = exp;
					}

					(*mpOutput) << min_exp << "\t";
					curr_genes[i+old_gene_idx].Write(mpOutput);
				}
			}
		}
	}
	return true;
}		/* -----  end of method DataProcessor::HighIsoforms----- */

