/*
 * =====================================================================================
 *
 *       Filename:  DataProcessor.cc
 *
 *    Description:  The implementation of class DataProcessor
 *
 *        Version:  1.0
 *        Created:  04/17/2009 04:38:40 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
 *        Company:  THU
 *
 * =====================================================================================
 */

#include "DataProcessor.h"
#include "InstanceWriter.h"
#include "InstanceReader.h"
#include <cassert>
#include <algorithm>
#include <string.h>
#include "ReadInfoSE.hpp"
#include "ReadInfoPE.hpp"

//#define DEBUG0
//#define DEBUG1

/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  DataProcessor
 * Description:  constructor
 *--------------------------------------------------------------------------------------
 */
DataProcessor::DataProcessor()
{
	// No construction-time work is performed in this body.
}  /* -----  end of method DataProcessor::DataProcessor  (constructor)  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  ExtractExon
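 *  Description:  Extract all the exons from the file. Overlapping exons are split
 *                at every exon boundary seen on the same chromosome.
 *        Param:  from_file :  The format of the file is the format of knownGene table 
 *                             from UCSC genome browser
 *                exons[out]:  A map from the chromosome to its exons. Each array is sorted.
 *                read_len  :  Exons shorter than this are counted in the printed summary.
 *       Return:  true on success, false if the file can not be opened.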
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractExon(string from_file, map_str2vec_exon& exons, int read_len)
{
	// Reads the gene table, collects every exon and every exon boundary ("bar")
	// per chromosome, then cuts each exon at all bars that fall inside it so
	// that the resulting exons never partially overlap.
	fstream infile;
	infile.open(from_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file.c_str() << " can not be opened" << endl;
		return false;
	}

	// chromosome -> all exon start/end coordinates seen on it
	map_str2vec_int bars;

	int org_total = 0;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		vector<string> splitted = Utility::Split('\t', line);

		// Columns 1, 2, 7, 8 and 9 are read below; a shorter record can not be used
		if (splitted.size() < 10)
		{
			cerr << "DATA ERROR, line " << line_cnt << " : too few columns, line skipped" << endl;
			line_cnt++;
			continue;
		}

		// chr is the combination of chromosome and strand
		string chr = splitted[1];
		if (mbStrandSpecific)
			chr += splitted[2];
		const bool strand = (splitted[2] == "+");
		const int exon_cnt = atoi(splitted[7].c_str());
		vector<string> starts = Utility::Split(',', splitted[8]);
		vector<string> ends = Utility::Split(',', splitted[9]);

		if (starts.size() != ends.size())
			cerr << "DATA ERROR, line " << line_cnt << " : Starts cnt != Ends cnt" << endl;
		if (exon_cnt != static_cast<int>(starts.size()))
		{
			cerr << exon_cnt << "," << starts.size() -1 << endl;
			cerr << "DATA ERROR, line " << line_cnt << " : Exon cnt != Starts cnt" << endl;
		}

		// Never read past the coordinates that are really present on the line
		int usable = exon_cnt;
		if (usable > static_cast<int>(starts.size()))
			usable = static_cast<int>(starts.size());
		if (usable > static_cast<int>(ends.size()))
			usable = static_cast<int>(ends.size());

		// operator[] creates the per-chromosome arrays on first use
		vector<int>& chr_bars = bars[chr];
		vector<Exon>& chr_exons = exons[chr];

		for (int i = 0; i < usable; i++)
		{
			const int exon_start = atoi(starts[i].c_str());
			const int exon_end = atoi(ends[i].c_str());

			chr_bars.push_back(exon_start);
			chr_bars.push_back(exon_end);

			Exon exon;
			exon.mStart = exon_start;
			exon.mEnd = exon_end;
			exon.mChr = chr;
			exon.mStrand = strand;
			chr_exons.push_back(exon);
			org_total++;
		}

		line_cnt++;
	}

	// On each chromosome, partition the exons
	for_each_ele_in_group(iter, map_str2vec_int, bars)
	{
		vector<Exon>& curr_exons = exons[iter->first];
		vector<int>& curr_bars = iter->second;

		// Sort the bars and remove dups
		std::sort(curr_bars.begin(), curr_bars.end());
		curr_bars.erase(std::unique(curr_bars.begin(), curr_bars.end()), curr_bars.end());

		// Pieces cut off from the exons; appended after the scan so that
		// curr_exons is not resized while it is being walked.
		vector<Exon> pieces;

		const int bar_cnt = static_cast<int>(curr_bars.size());
		const int exon_total = static_cast<int>(curr_exons.size());
		for (int i = 0; i < exon_total; i++)
		{
			// binary search the start position of this exon in curr_bars
			int idx = UtilityTemp<int>::BinarySearch(curr_bars, curr_exons[i].mStart);

			// The exon is cut at the bar following its start, so that bar must exist
			if (idx < 0 || idx + 1 >= bar_cnt)
			{
				cerr << "ERROR : no boundary follows the start of an exon on " << iter->first << endl;
				continue;
			}
			if (curr_bars[idx] != curr_exons[i].mStart)
				cerr << "ERROR : curr_bars[idx] != curr_exons[i].mStart" << endl;

			// The exon itself keeps the first piece; the rest become new exons
			const int end = curr_exons[i].mEnd;
			curr_exons[i].mEnd = curr_bars[++idx];
			while (end > curr_bars[idx] && idx + 1 < bar_cnt)
			{
				Exon piece;
				piece.mStart = curr_bars[idx];
				piece.mEnd = curr_bars[idx+1];
				piece.mChr = curr_exons[i].mChr;
				piece.mStrand = curr_exons[i].mStrand;
				pieces.push_back(piece);
				idx++;
			}
		}

		for (unsigned i = 0; i < pieces.size(); i++)
			curr_exons.push_back(pieces[i]);

		// Sort the exons and remove dups. An empty array is left empty
		// (compacting it must not create a default exon).
		std::sort(curr_exons.begin(), curr_exons.end());
		if (!curr_exons.empty())
		{
			int kept = 0;
			for (int i = 1; i < static_cast<int>(curr_exons.size()); i++)
			{
				if (curr_exons[i] != curr_exons[kept])
					curr_exons[++kept] = curr_exons[i];
			}
			curr_exons.resize(kept + 1);
		}
	}

	// Summary: original exon count, partitioned count, and how many are shorter than a read
	int total = 0;
	int cnt_short_read = 0;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		vector<Exon>& curr_exons = iter->second;
		total += curr_exons.size();
		for (unsigned i = 0; i < curr_exons.size(); i++)
		{
			if (curr_exons[i].mEnd - curr_exons[i].mStart < read_len)
				cnt_short_read++;
		}
	}

	cout << "Ori : Part : Short = " << org_total << " : " << total << " : " << cnt_short_read << endl;
	return true;
}		/* -----  end of method DataProcessor::ExtractExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  ExtractRef
 * Description:  Given the requested start positions and a length, this method extracts the sequences
 *       Param:  start_pos_of_first_nt : The position of the first nucleotide on a chromosome.
 *      Return:
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractRef(string& refseq_file, map<string, map<int, string> >& pos2refseq, int cross_len, int start_pos_of_first_nt)
{
	// Streams a FASTA-like file once. For every position that is already a key
	// of pos2refseq[chr], the cross_len characters starting at that position
	// are stored as the value. Positions closer than cross_len to the end of a
	// chromosome receive the (shorter) remainder of the chromosome.
	//   refseq_file           : path of the reference sequence file
	//   pos2refseq [in/out]   : keys select what to extract, values are filled
	//   cross_len             : length of the extracted sequences, must be > 0
	//   start_pos_of_first_nt : coordinate assigned to the first character
	// Returns false if the file can not be opened, cross_len is not positive
	// or a header line can not be parsed.

	// The ring buffer needs at least one slot; every index is taken modulo cross_len
	if (cross_len <= 0)
	{
		cerr << "ERROR : cross_len must be positive, got " << cross_len << endl;
		return false;
	}

	fstream infile;
	infile.open(refseq_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << refseq_file.c_str() << " can not be opened" << endl;
		return false;
	}

	int nt_cnt = start_pos_of_first_nt;      // coordinate of the next character to be read
	vector<char> buf(cross_len, 0);          // ring buffer with the last cross_len characters
	int buf_pointer = 0;                     // slot the next character is written to
	string curr_chr;
	string line;

	// Maps to fill for the current chromosome; a null entry means "nothing requested".
	// targets[0]: "+" strand (or the chromosome itself when not strand specific)
	// targets[1]: "-" strand
	map<int, string>* targets[2] = { 0, 0 };

	for (;;)
	{
		const bool at_eof = !getline(infile, line);
		const bool at_header = !at_eof && !line.empty() && line[0] == '>';

		if (at_eof || at_header)
		{
			// The chromosome read so far is complete: fill the requested
			// positions whose sequence is cut short by its end. k is the
			// number of characters left from the position to the end, and
			// never exceeds what was really read for this chromosome.
			const int nt_in_chr = nt_cnt - start_pos_of_first_nt;
			const int newest = (buf_pointer + cross_len - 1) % cross_len;
			for (int k = 1; k < cross_len && k <= nt_in_chr; k++)
			{
				const int pos = nt_cnt - k;
				for (int t = 0; t < 2; t++)
				{
					if (!targets[t])
						continue;
					map<int, string>::iterator hit = targets[t]->find(pos);
					if (hit == targets[t]->end())
						continue;

					// Walk the ring backwards from the newest character
					string seq(k, ' ');
					int idx = newest;
					for (int i = k - 1; i >= 0; i--)
					{
						seq[i] = buf[idx];
						idx = (idx + cross_len - 1) % cross_len;
					}
					hit->second = seq;
				}
			}

			if (at_eof)
				break;

			if (nt_cnt != start_pos_of_first_nt)
				cout << "Length of chromosome " << curr_chr << " is " << nt_cnt << endl;
			nt_cnt = start_pos_of_first_nt;
			buf_pointer = 0;

			// The chromosome name is the second field of the header when
			// it is split on '>' and '|'
			vector<char> delimits;
			delimits.push_back('>');
			delimits.push_back('|');
			vector<string> splitted = Utility::Split(delimits, line);
			if (splitted.size() < 2)
			{
				cerr << "ERROR : can not parse the header line " << line << endl;
				return false;
			}
			curr_chr = splitted[1];

			targets[0] = 0;
			targets[1] = 0;
			if (mbStrandSpecific)
			{
				string chr = curr_chr + "+";
				if (pos2refseq.find(chr) != pos2refseq.end())
					targets[0] = &pos2refseq[chr];
				chr = curr_chr + "-";
				if (pos2refseq.find(chr) != pos2refseq.end())
					targets[1] = &pos2refseq[chr];
			}
			else
			{
				if (pos2refseq.find(curr_chr) != pos2refseq.end())
					targets[0] = &pos2refseq[curr_chr];
			}
			continue;
		}

		// Sequence line: push every character through the ring buffer
		for (string::size_type c = 0; c < line.length(); c++)
		{
			buf[buf_pointer] = line[c];

			// Start position of the window that ends at the current character.
			// Note, +1 here
			const int pos = nt_cnt - cross_len + 1;

			// The window is only complete once cross_len characters of this
			// chromosome have been read
			if (pos >= start_pos_of_first_nt)
			{
				for (int t = 0; t < 2; t++)
				{
					if (!targets[t])
						continue;
					map<int, string>::iterator hit = targets[t]->find(pos);
					if (hit == targets[t]->end())
						continue;

					string seq(cross_len, ' ');
					int idx = buf_pointer;
					for (int i = cross_len - 1; i >= 0; i--)
					{
						seq[i] = buf[idx];
						idx = (idx + cross_len - 1) % cross_len;
					}
					hit->second = seq;
				}
			}

			buf_pointer = (buf_pointer + 1) % cross_len;
			nt_cnt++;
		}
	}

	infile.close();

	return true;
}		/* -----  end of method DataProcessor::ExtractRef  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  CreateReadInfo
 *  Description:  Load read information.
 *        Param:  read_info_file  [i] : See -read_info for the format of this file. 
 *                read_infos      [o] : The pointer in this array will be newed. They
 *                                      should be deleted outside.
 *       Return:  
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::CreateReadInfo(string read_info_file, 
						  vector<ReadInfoBase*>& read_infos,
						  vector<string>& mapping_files)
{
	// Each record starts with a line "<mapping file> <read type> ...". The
	// matching ReadInfo object (type 0: single end, type 1: paired end) then
	// reads its own data from the stream. The objects appended to read_infos
	// are allocated with new and owned by the caller.
	// Returns false if the file can not be opened or a record line is malformed;
	// an unrecognized read type terminates the program.
	fstream infile;
	infile.open(read_info_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << read_info_file.c_str() << " can not be opened" << endl;
		return false;
	}

	string line;
	while (getline(infile, line))
	{
		const vector<string> splitted = Utility::Split(' ', line);

		// Both the mapping file name and the read type are required
		if (splitted.size() < 2)
		{
			cerr << "Error : malformed read info line : " << line << endl;
			return false;
		}

		const int read_type = atoi(splitted[1].c_str());

		ReadInfoBase* p_read_info = 0;
		if (0 == read_type)
			p_read_info = new ReadInfoSE;
		else if (1 == read_type)
			p_read_info = new ReadInfoPE;
		else 
		{
			cerr << "Error : Unrecognized short read type" << endl;
			// Terminate with a failure status so that calling scripts notice the error
			exit(1);
		}

		// Only record the mapping file once the record is known to be usable,
		// keeping mapping_files and read_infos the same length
		mapping_files.push_back(splitted[0]);

		p_read_info->Read(&infile);
		// Skip whatever is left on the line the object stopped reading at
		infile.ignore(99999, '\n');
		read_infos.push_back(p_read_info);
	}
	infile.close();
	return true;
}

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  WriteReadInfo
 *  Description:  Write read information to a file
 *        Param:  read_info_file  [i] : The file to write. See -read_info for the format of this file. 
 *                read_infos      [i] : The read information objects to be written.
 *                mapping_files   [i] : The mapping file name written in front of each object.
 *       Return:  
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::WriteReadInfo(string read_info_file, 
						  vector<ReadInfoBase*>& read_infos,
						  vector<string>& mapping_files)
{
	// Writes one record per entry of read_infos: the mapping file name, a
	// space, whatever the object's Write() emits, then a newline.
	// Returns false if the file can not be opened or if there are fewer
	// mapping files than read infos.

	// Checked before the file is opened so that a bad call does not truncate
	// an existing file
	if (mapping_files.size() < read_infos.size())
	{
		cerr << "Error : " << read_infos.size() << " read infos but only " 
			 << mapping_files.size() << " mapping files" << endl;
		return false;
	}

	fstream outfile;
	outfile.open(read_info_file.c_str(), ios::out);
	if (!outfile.is_open())
	{
		cerr << "File " << read_info_file.c_str() << " can not be opened" << endl;
		return false;
	}

	for (unsigned i = 0; i < read_infos.size(); ++i)
	{
		outfile << mapping_files[i] << " ";
		read_infos[i]->Write(&outfile);
		outfile << endl;
	}
	
	outfile.close();
	return true;
}
/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  GetInvolvedExons
 *  Description:  
 *        Param:  
 *       Return:  
 *--------------------------------------------------------------------------------------
 */
int
DataProcessor::GetInvolvedExons(string& a_line, int line_cnt, map<string, vector<Exon> >& exons, 
								map_str2vec_int& exon_2_instance, map_str2vec_int& exon_2_local_idx,
								vector<int>& involved_exons)
{
	// Parses one mapping line (chromosome, strand, comma separated segment
	// starts, comma separated segment ends) and appends to involved_exons the
	// local indexes of all exons covered by the segments.
	// Returns the index of the instance all segments belong to, -1 if the
	// read can not be placed (unknown chromosome, malformed line, position in
	// front of the first exon, segments in different instances) and -2 if an
	// exon is in no instance or one segment spans two instances.
	vector<string> splitted = Utility::Split('\t', a_line);

	int gene_idx = -1;

	// Columns 0 to 3 are read below
	if (splitted.size() < 4)
	{
#ifdef DEBUG1
		cerr << "WARNING : line " << line_cnt << ". Too few columns. Discarded" << endl;
#endif
		return gene_idx;
	}

	string chr = splitted[0];
	if (mbStrandSpecific)
		chr += splitted[1];

	// Leave before any operator[] below could insert an empty entry for an
	// unknown chromosome into the caller's maps
	if (exons.find(chr) == exons.end())
	{
#ifdef DEBUG1
		cerr << "WARNING : line " << line_cnt << ". No information of chromosome" << chr << " exists." << endl;
#endif
		return gene_idx;
	}

	vector<Exon>& exons_on_curr_chr = exons[chr];
	vector<int>& curr_exon_2_instance = exon_2_instance[chr];
	vector<int>& curr_local_idx = exon_2_local_idx[chr];

	vector<string> starts = Utility::Split(',', splitted[2]);
	vector<string> ends = Utility::Split(',', splitted[3]);

	// Every segment start needs its end
	if (ends.size() < starts.size())
	{
#ifdef DEBUG1
		cerr << "WARNING : line " << line_cnt << ". Fewer segment ends than starts. Discarded" << endl;
#endif
		return gene_idx;
	}

	for (unsigned i = 0; i < starts.size(); ++i)
	{
		// The end coordinate is exclusive, hence the -1
		int start_exon_idx = FindExon(exons_on_curr_chr, atoi(starts[i].data()));
		int end_exon_idx = FindExon(exons_on_curr_chr, atoi(ends[i].data())-1);
		if (-1 == start_exon_idx)
		{
#ifdef DEBUG1
			cerr << "WARNING : line " << line_cnt << ". " << starts[i] << " does not appear in any exon. Discarded" << endl;
#endif
			break;
		}
		if (-1 == end_exon_idx)
		{
#ifdef DEBUG1
			cerr << "WARNING : line " << line_cnt << ". " << ends[i] << " does not appear in any exon. Discarded" << endl;
#endif
			break;
		}


		if (-1 == curr_exon_2_instance[start_exon_idx])
		{
#ifdef DEBUG1
			cerr << "WARNING : line " << line_cnt << ". " << starts[i] << " does not appear in any instance. Discarded" << endl;
#endif
			gene_idx = -2;
			break;
		}
		if (-1 == curr_exon_2_instance[end_exon_idx])
		{
#ifdef DEBUG1
			cerr << "WARNING : line " << line_cnt << ". " << ends[i] << " does not appear in any instance. Discarded" << endl;
#endif
			gene_idx = -2;
			break;
		}
		if (curr_exon_2_instance[start_exon_idx] != curr_exon_2_instance[end_exon_idx])
		{
#ifdef DEBUG1
			cerr << "WARNING : line " << line_cnt << ". The read has been mapped to more than two instances. Discarded" << endl;
#endif
			gene_idx = -2;
			break;
		}

		// The first segment fixes the instance; later ones have to agree
		if (-1 == gene_idx)
			gene_idx = curr_exon_2_instance[start_exon_idx];
		else if (curr_exon_2_instance[start_exon_idx] != gene_idx)
		{
			gene_idx = -1;
			break;
		}

		for (int j = start_exon_idx; j <= end_exon_idx; j++)
			involved_exons.push_back(curr_local_idx[j]);
	}

	// Remove duplications in the involved exons and sort it
	UtilityTemp<int>::RemoveDups(involved_exons);

	if (-1 == gene_idx)
	{
#ifdef DEBUG1
		cerr << "WARNING : line " << line_cnt << ". The exons do not belong to the same instance. This read is discarded" << endl;
#endif
	}

	return gene_idx;
}		/* -----  end of method DataProcessor::GetInvolvedExons  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadShortReads
 *  Description:  Load short reads from a file
 *        Param:  
 *       Return:  
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadShortReads(string from_file, map<string, vector<Exon> >& exons, 
		ReadInfoBase* p_read_info, vector<Instance>& all_instances)
{
	// Loads the mapped reads of one data set. Every instance gets a new
	// ShortRead entry (tagged with p_read_info); each read that falls entirely
	// into one instance is added to that entry as a pattern of local exon
	// indexes, identical patterns being counted instead of stored twice.
	// For paired-end data two consecutive lines form one read.
	// Returns false only if the file can not be opened.
	fstream infile;
	infile.open(from_file.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << from_file.c_str() << " can not be opened" << endl;
		return false;
	}

	cout << "Loading short reads from " << from_file << endl;

	// For every exon, find the instance to which it belongs
	// Setup the mapping from exon ID to the index of exons on every instance
	// (-1 means "not assigned")
	map_str2vec_int exon_2_instance;
	map_str2vec_int exon_2_local_idx;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		const string& curr_chr = iter->first;
		exon_2_instance[curr_chr].assign(iter->second.size(), -1);
		exon_2_local_idx[curr_chr].assign(iter->second.size(), -1);
	}

	for (unsigned ins = 0; ins < all_instances.size(); ins++)
	{
		Instance& an_instance = all_instances[ins];

		// An instance without segments has no exon to register (and no
		// first exon to take the chromosome from)
		if (an_instance.mSegLen.size() > 0)
		{
			string curr_chr = an_instance.mExons[0].mChr;
			vector<Exon>& exons_on_curr_chr = exons[curr_chr];
			vector<int>& curr_exon_2_instance = exon_2_instance[curr_chr];
			vector<int>& curr_local_idx = exon_2_local_idx[curr_chr];

			for (unsigned i = 0; i < an_instance.mSegLen.size(); i++)
			{
				int start = an_instance.mExons[i].mStart;
				int exon_idx = FindExon(exons_on_curr_chr, start);

				// FindExon returns -1 when the position lies in front of all exons
				if (exon_idx < 0 || exon_idx >= static_cast<int>(curr_exon_2_instance.size()))
				{
					cerr << "WARNING : segment " << i << " of instance " << ins 
						 << " does not match any exon. Skipped" << endl;
					continue;
				}
				curr_exon_2_instance[exon_idx] = ins;
				curr_local_idx[exon_idx] = i;
			}
		}

		// The reads of this data set go into a new, last ShortRead entry
		vector<ShortRead>& short_reads = an_instance.mShortReadGroup.mShortReads;
		short_reads.resize(short_reads.size()+1);
		short_reads.back().mpReadInfo = p_read_info;
	}
	
	// read the data and fill the vector
	int read_cnt = 0;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		++line_cnt;
	
		Pattern pattern;
		vector<int> first_involved_exons;
		int first_ins_idx = GetInvolvedExons(line, line_cnt, exons, exon_2_instance, exon_2_local_idx, first_involved_exons);
		pattern.mMappedSegs.push_back(first_involved_exons);

		vector<int> second_involved_exons;
		int second_ins_idx = first_ins_idx;
		if (!p_read_info->IsSingleEnd())
		{
			// The mate is on the next line; without it the read is incomplete
			if (!getline(infile, line))
			{
				cerr << "WARNING : line " << line_cnt << ". The second end of the last read is missing" << endl;
				break;
			}
			++line_cnt;
			second_ins_idx = GetInvolvedExons(line, line_cnt, exons, exon_2_instance, exon_2_local_idx, second_involved_exons);
			pattern.mMappedSegs.push_back(second_involved_exons);
		}

		if (second_ins_idx != first_ins_idx)
		{
#ifdef DEBUG0
			cerr << "WARNING : line " << line_cnt << ". The paired ends do not belong to the same instance" << endl;
			continue;
#endif
		}
		if (first_involved_exons.size() > 0 && (p_read_info->IsSingleEnd() || second_involved_exons.size() > 0) &&
			first_ins_idx >= 0 && second_ins_idx == first_ins_idx)
		{
			Instance& an_instance = all_instances[first_ins_idx];

			// The entry appended above for this data set
			ShortRead& short_read = an_instance.mShortReadGroup.mShortReads.back();

			// Count the read on an existing identical pattern, or add a new one
			bool b_exist = false;
			for (unsigned i = 0; i < short_read.mPatterns.size(); ++i)
			{
				if (pattern == short_read.mPatterns[i])
				{
					b_exist = true;
					short_read.mPatternDup[i]++;
					break;
				}
			}
			if (!b_exist)
			{
				short_read.mPatterns.push_back(pattern);
				short_read.mPatternDup.push_back(1);
			}

			read_cnt++;
		}

		if (line_cnt % 100000 == 0)
			cout << line_cnt << " lines have been scaned." << "   "  
				 << read_cnt << " reads have been loaded successfully." << endl;
	}

	cout << line_cnt << " lines have been scaned." << "   "  
		 << read_cnt << " reads have been loaded successfully." << endl;

	p_read_info->mTotalReadCnt = read_cnt;

	infile.close();

	return true;
}		/* -----  end of method DataProcessor::LoadShortReads----- */



/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  Decompose
 *  Description:  Given a interval, decompose it into exons. This interval is supposed
 *                start from the start position of some exon and end at the end position
 *                of some exon.
 *        Param:  exons_idx[o]  :  Stores the index of decomposed exons.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
void
DataProcessor::Decompose(vector<Exon>& exons, int start, int end, vector<int>& exons_idx)
{
	// Appends to exons_idx the indexes of the consecutive exons that cover
	// [start, end]. exons has to be sorted by start position. The program is
	// terminated if no exon starts at `start` or none of the following exons
	// ends at `end`.

	// Binary search for the exon with start position == start
	int low = 0;
	int high = static_cast<int>(exons.size()) - 1;
	int mid = -1;
	bool found = false;

	while (low <= high)
	{
		mid = low + (high - low) / 2;

		if (exons[mid].mStart < start)
			low = mid + 1;
		else if (exons[mid].mStart > start)
			high = mid - 1;
		else
		{
			found = true;
			break;
		}
	}

	// Also covers an empty exon array, where there is no exons[mid] to look at
	if (!found)
	{
		cerr << __func__ << " ERROR : exons[mid].mStart != start" << endl;
		exit(1);
	}

	// Collect exons until the one reaching `end`
	const int exon_cnt = static_cast<int>(exons.size());
	while (mid < exon_cnt)
	{
		exons_idx.push_back(mid);
		if (exons[mid].mEnd < end)
			mid++;
		else 
			break;
	}

	// mid == exon_cnt means the array ended before `end` was reached
	if (mid >= exon_cnt || exons[mid].mEnd != end)
	{
		cerr << __func__ << " ERROR : exons[mid].mEnd != end" << endl;
		exit(1);
	}
	return ;
}		/* -----  end of method DataProcessor::Decompose  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  FindExon
 *  Description:  Given a position, find the exon that contains this position, if no such
 *                exon exists, the one with greatest start position which is less than
 *                the given position is returned
 *        Param:  pos
 *       Return:  
 *         Note:  Make sure that exons has been sorted according to their start position.
 *--------------------------------------------------------------------------------------
 */
int
DataProcessor::FindExon(vector<Exon>& exons, int64 pos)
{
	// Binary search over the start positions. An exact hit is returned at
	// once; otherwise the search ends with `last` on the exon with the
	// greatest start below pos, or at -1 if pos precedes every exon.
	int first = 0;
	int last = exons.size() - 1;

	while (first <= last)
	{
		const int probe = (first + last) / 2;

		if (pos < exons[probe].mStart)
		{
			last = probe - 1;
			continue;
		}
		if (pos > exons[probe].mStart)
		{
			first = probe + 1;
			continue;
		}

		// pos == exons[probe].mStart
		return probe;
	}

	return last;
}		/* -----  end of method DataProcessor::FindExon  ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGenes
 *  Description:  Load genes from a file
 *        Param:  gene_file_name     :  The format of this file is :
 *   	              name chromosome  strand  start_position end_position exon_start_positions, exon_end_positions
 *   	          genes  :  genes should be sorted by the start positions.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGenes(string gene_file_name, map_str2vec_gene& genes)
{
	// Reads one gene per line (tab separated: name, chromosome, strand, start,
	// end and optionally the comma separated exon starts and exon ends) and
	// appends it to genes[chromosome]; with mbStrandSpecific the strand is
	// part of the chromosome key. Lines repeating a gene name, lines with too
	// few columns and lines whose exon start and end counts differ are skipped.
	// The genes of every chromosome are sorted at the end.
	// Returns false only if the file can not be opened.
	fstream infile;
	infile.open(gene_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << gene_file_name.c_str() << " can not be opened" << endl;
		return false;
	}
	cout << "Loading genes" << endl;

	vector<string> splitted;
	int line_cnt = 0;
	string line;
	set<string> gene_names;
	while (getline(infile, line))
	{
		line_cnt++;
		splitted = Utility::Split('\t', line);

		// Columns 0 to 4 are mandatory and read unconditionally below
		if (splitted.size() < 5)
		{
			cerr << "ERROR, line " << line_cnt << " : fewer than 5 columns, line skipped" << endl;
			continue;
		}

		string name = splitted[0];
		string chr = splitted[1];

		if (gene_names.find(name) != gene_names.end())
		{
			cerr << "WARNING : gene " << name << " has appeared before. Line " << line_cnt << endl;
			continue;
		}
		gene_names.insert(name);

		if (mbStrandSpecific)
			chr += splitted[2];

		// The exon columns are optional; without them the gene has no exons
		vector<string> starts;
		vector<string> ends;
		if (splitted.size() >= 7)
		{
			starts = Utility::Split(',', splitted[5]);
			ends = Utility::Split(',', splitted[6]);
		}

		if (ends.size() != starts.size())
		{
			cerr << "ERROR, line " << line_cnt << " : Exon starts cnt != ends cnt" << endl;
			continue;
		}

		Gene a_gene;
		a_gene.mName = name;
		a_gene.mChr = chr;
		a_gene.mStrand = (splitted[2] == "+");
		a_gene.mStart = atoi(splitted[3].c_str());
		a_gene.mEnd = atoi(splitted[4].c_str());

		vector<Exon>& gene_exons = a_gene.mExons;
		gene_exons.resize(starts.size());
		for (unsigned i = 0; i < starts.size(); i++)
		{
			gene_exons[i].mStart = atoi(starts[i].c_str());
			gene_exons[i].mEnd = atoi(ends[i].c_str());
		}

		// operator[] creates the array of a chromosome on first use
		genes[chr].push_back(a_gene);
	}

	infile.close();
	cout << "Loading genes done. " << gene_names.size() << " genes have been loaded." << endl;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
		sort(iter->second.begin(), iter->second.end());

	return true;
}		/* -----  end of method DataProcessor::LoadGenes ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadBoundary
 *  Description:  Load exon-intron boundaries from a file
 *        Param:  boundary_file_name :  The format of the file is :
 *                    chromosome  strand  position type
 *                    The position is always the position of the first base 
 *                    of an exon or intron. 
 *       Return:
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadBoundary(string boundary_file_name, map_str2vec_boundary& boundaries)
{
	// Reads one boundary per line (tab separated: chromosome, strand,
	// position, type) and stores, per chromosome, one Boundary for every
	// distinct position. A position seen with both type 0 and type 1 gets
	// type 2. With mbStrandSpecific the strand is part of the chromosome key.
	// Lines with too few columns or a type other than 0/1 are skipped.
	// Returns false only if the file can not be opened.
	fstream infile;
	infile.open(boundary_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << boundary_file_name.c_str() << " can not be opened" << endl;
		return false;
	}

	// chromosome -> (position -> merged type)
	map_str2map_int642int bars;

	cout << "Loading boundaries" << endl;
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;
		splitted = Utility::Split('\t', line);

		// Columns 0 to 3 are read below
		if (splitted.size() < 4)
		{
			cerr << "WARNING : on line " << line_cnt << ", fewer than 4 columns, line skipped" << endl;
			continue;
		}

		string chr = splitted[0];
		if (mbStrandSpecific)
			chr += splitted[1];

		int64 position = atoi(splitted[2].c_str());
		int type = atoi(splitted[3].c_str());
		if (type != 0 && type != 1)
		{
			cerr << "WARNING : on line " << line_cnt << ", the type of boundary is not 0 or 1" << endl;
			continue;
		}

		// operator[] creates the map of a chromosome on first use
		map<int64,int>& curr_bounds = bars[chr];

		map<int64,int>::iterator known = curr_bounds.find(position);
		if (known == curr_bounds.end())
			curr_bounds[position] = type;
		else if (known->second != type)
			known->second = 2;     // seen with both types
	}

	infile.close();

	// Turn the per-chromosome maps into arrays of Boundary
	int bound_cnt = 0;
	for_each_ele_in_group(iter, map_str2map_int642int, bars)
	{
		bound_cnt += iter->second.size();

		// Whatever the caller had stored for this chromosome is replaced
		vector<Boundary>& curr_bounds = boundaries[iter->first];
		curr_bounds.clear();
		curr_bounds.resize(iter->second.size());

		int filled = 0;
		for_each_ele_in_group(iter2, map_int642int, iter->second)
		{
			Boundary& bound = curr_bounds[filled++];
			bound.mPos = iter2->first;
			bound.mType = iter2->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Loading boundaries done. " << bound_cnt << " boundaries have been loaded." << endl;
		
	return true;
}		/* -----  end of method DataProcessor::LoadBoundary ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadTSSPAS
 *  Description:  Load TSSs and PASs from a file
 *        Param:  tss_pas_file_name :  The format of the file is :
 *                    gene_name   TSSs   PASs
 *                    TSSs or PASs are sepereted by comma. In each line, an isoform
 *                    starting from one element in TSSs must end with some element in
 *                    PASs. There is no constraint on different lines.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadTSSPAS(string tss_pas_file_name, map_str2vec_gene& genes)
{
	// Reads groups of TSSs and PASs (tab separated: gene name, comma
	// separated TSSs, comma separated PASs) and stores them in mTSSs / mPASs
	// of the gene with that name. The k-th TSS group and the k-th PAS group
	// of a gene come from the same line. Positions outside the gene are
	// dropped with a warning; a line left without any TSS or any PAS is
	// skipped, as are lines naming an unknown gene or having too few columns.
	// Returns false only if the file can not be opened.
	fstream infile;
	infile.open(tss_pas_file_name.c_str(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << tss_pas_file_name.c_str() << " can not be opened" << endl;
		return false;
	}

	// gene name -> groups, one group per accepted line
	map<string, vector<vector<int64> > >  TSSs;
	map<string, vector<vector<int64> > >  PASs;

	// gene name -> gene boundaries, used to validate the positions
	map_str2int gene_starts;
	map_str2int gene_ends;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		vector<Gene>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
		{
			gene_starts[av[i].mName] = av[i].mStart;
			gene_ends[av[i].mName] = av[i].mEnd;
		}
	}

	cout << "Loading TSSs and PASs" << endl;
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;
		splitted = Utility::Split('\t', line);

		// Columns 0 to 2 are read below
		if (splitted.size() < 3)
		{
			cerr << "WARNING : line " << line_cnt << " has fewer than 3 columns, line skipped" << endl;
			continue;
		}

		string& gene_name = splitted[0];

		if (gene_starts.find(gene_name) == gene_starts.end())
		{
			cerr << "WARNING : " << gene_name << " in line " << line_cnt << " does not appear in gene file" << endl;
			continue;
		}

		vector<string> tss = Utility::Split(',', splitted[1]);
		vector<string> pas = Utility::Split(',', splitted[2]);

		// A TSS is accepted if gene start <= pos < gene end
		vector<int64> a_group_tss;
		for (unsigned i = 0; i < tss.size(); i++)
		{
			int pos = atoi(tss[i].c_str());

			if (gene_starts[gene_name] > pos || gene_ends[gene_name] <= pos)
			{
				cerr << "WARNING : the position " << pos << " in line " << line_cnt 
					 << " is out of the gene boundary" << endl;
			}
			else
				a_group_tss.push_back(pos);
		}

		// A PAS is accepted if gene start < pos <= gene end
		vector<int64> a_group_pas;
		for (unsigned i = 0; i < pas.size(); i++)
		{
			int pos = atoi(pas[i].c_str());

			if (gene_starts[gene_name] >= pos || gene_ends[gene_name] < pos)
			{
				cerr << "WARNING : the position " << pos << " in line " << line_cnt 
					 << " is out of the gene boundary" << endl;
			}
			else
				a_group_pas.push_back(pos);
		}

		if (a_group_tss.size() == 0)
		{
			cerr << "WARNING : no TSS is found in line " << line_cnt << endl;
			continue;
		}
		if (a_group_pas.size() == 0)
		{
			cerr << "WARNING : no PAS is found in line " << line_cnt << endl;
			continue;
		}

		// Both groups are stored together so that they stay index aligned
		TSSs[gene_name].push_back(a_group_tss);
		PASs[gene_name].push_back(a_group_pas);
	}

	infile.close();
	cout << "Loading TSSs and PASs done. " << endl;

	// Genes that did not appear in the file get empty arrays
	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		vector<Gene>& av = iter->second;
		for (unsigned i = 0; i < av.size(); i++)
		{
			string& name = av[i].mName;
			av[i].mTSSs = TSSs[name];
			av[i].mPASs = PASs[name];
		}
	}

	return true;
}		/* -----  end of method DataProcessor::LoadTSSPAS ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGeneAndExon
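 *  Description:  Load genes from a file and build the expressed segments (exons)
 *                of every chromosome from the boundaries found in these genes.
 *        Param:  The boundaries (exon starts and ends, plus any TSSs and PASs the
 *                genes carry) are extracted from the genes in gene_file_name.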
 *       Return:
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGeneAndExon(string gene_file_name, map_str2vec_gene& genes, map_str2vec_exon& exons)
{
	// Loads the genes, collects on every chromosome all boundary positions
	// (exon starts and TSSs as type 0, exon ends and PASs as type 1, type 2
	// for a position seen with both types) and turns every pair of adjacent
	// boundaries into one expressed segment stored in exons[chromosome].
	// Returns false if the genes can not be loaded.
	if (!LoadGenes(gene_file_name, genes)) return false;

	map_str2vec_boundary boundaries;

	// Extract all the boundaries including TSSs and PASs on each chromosome
	cout << "Merge boundaries with TSSs and PASs" << endl;
	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		const string& chr = iter->first;
		vector<Gene>& curr_genes = iter->second;

		// Positions by boundary type: marks[0] holds the type 0 positions
		// (exon starts, TSSs), marks[1] the type 1 positions (exon ends, PASs)
		vector<int64> marks[2];

		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			Gene& a_gene = curr_genes[i];

			for (unsigned j = 0; j < a_gene.mExons.size(); j++)
			{
				marks[0].push_back(a_gene.mExons[j].mStart);
				marks[1].push_back(a_gene.mExons[j].mEnd);
			}
			for (unsigned j = 0; j < a_gene.mTSSs.size(); j++)
			{
				for (unsigned k = 0; k < a_gene.mTSSs[j].size(); k++)
					marks[0].push_back(a_gene.mTSSs[j][k]);

				// The PAS groups are expected to parallel the TSS groups;
				// do not read a group that is not there
				if (j >= a_gene.mPASs.size())
					continue;
				for (unsigned k = 0; k < a_gene.mPASs[j].size(); k++)
					marks[1].push_back(a_gene.mPASs[j][k]);
			}
		}

		// position -> merged type. The result does not depend on the order
		// in which the marks are merged: a position ends up with type 2 as
		// soon as it has been seen with both types.
		map_int642int bar_set;
		for (int type = 0; type < 2; type++)
		{
			for (unsigned k = 0; k < marks[type].size(); k++)
			{
				const int64 pos = marks[type][k];
				if (bar_set.find(pos) == bar_set.end())
					bar_set[pos] = type;
				else if (bar_set[pos] != type)
					bar_set[pos] = 2;
			}
		}

		vector<Boundary>& curr_bounds = boundaries[chr];
		curr_bounds.resize(bar_set.size());
		int filled = 0;
		for_each_ele_in_group(bar_iter, map_int642int, bar_set)
		{
			Boundary& bound = curr_bounds[filled++];
			bound.mPos = bar_iter->first;
			bound.mType = bar_iter->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Build all expressed segments" << endl;
	int cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_boundary, boundaries)
	{
		string chr = iter->first;

		// With strand specific data the last character of the key is the strand
		bool strand = true;
		if (mbStrandSpecific)
			strand = (!chr.empty() && chr[chr.length()-1] == '+');

		vector<Boundary>& curr_bounds = iter->second;

		// Whatever the caller had stored for this chromosome is replaced
		vector<Exon>& curr_exons = exons[chr];
		curr_exons.clear();

		// No boundary, no segment (and size()-1 below would wrap around)
		if (curr_bounds.empty())
			continue;

		curr_exons.resize(curr_bounds.size()-1);

		for (unsigned i = 1; i < curr_bounds.size(); i++)
		{
			// Note that every segment could be a part of an exon
			Exon& exon = curr_exons[i-1];
			exon.mStart = curr_bounds[i-1].mPos;
			exon.mEnd = curr_bounds[i].mPos;
			exon.mStartType = curr_bounds[i-1].mType;
			exon.mEndType = curr_bounds[i].mType;
			exon.mChr = chr;
			exon.mStrand = strand;
		}
		cnt += curr_exons.size();
	}
	cout << cnt << " expressed segments have been build" << endl;

	return true;
}		/* -----  end of method DataProcessor::LoadGeneAndExon  ----- */



/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  LoadGeneAndExon
 *  Description:  Load exon-intron boundaries from a file, merge them with the TSSs
 *                and PASs of the loaded genes and build one expressed segment
 *                between every two neighbouring boundaries of a chromosome.
 *        Param:  boundary_file_name :  Passed to LoadBoundary
 *                gene_file_name     :  Passed to LoadGenes
 *                tss_pas_file_name  :  Passed to LoadTSSPAS
 *                genes[out]         :  Filled by LoadGenes and LoadTSSPAS
 *                exons[out]         :  For each chromosome, the expressed segments.
 *                                      Existing content of a chromosome is replaced.
 *       Return:  false if any of the three files fails to load, true otherwise
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::LoadGeneAndExon(string boundary_file_name, string gene_file_name, 
							   string tss_pas_file_name, map_str2vec_gene& genes, map_str2vec_exon& exons)
{
	if (!LoadGenes(gene_file_name, genes)) return false;
	if (!LoadTSSPAS(tss_pas_file_name, genes)) return false;

	map_str2vec_boundary boundaries;
	if (!LoadBoundary(boundary_file_name, boundaries)) return false;

	// Extract all the boundaries including TSSs and PASs on each chromosome
	cout << "Merge boundaries with TSSs and PASs" << endl;
	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		const string& chr = iter->first;
		vector<Gene>& curr_genes = iter->second;

		// operator[] creates an empty list for chromosomes that have genes but
		// no boundary in the boundary file
		vector<Boundary>& curr_bounds = boundaries[chr];

		// Position -> boundary type, seeded with the loaded boundaries. A TSS
		// contributes type 0, a PAS type 1; a position that receives a type
		// different from the one already stored becomes type 2.
		map_int642int bar_set;
		for (unsigned i = 0; i < curr_bounds.size(); i++)
			bar_set[curr_bounds[i].mPos] = curr_bounds[i].mType;

		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			Gene& a_gene = curr_genes[i];

			for (unsigned j = 0; j < a_gene.mTSSs.size(); j++)
			{
				for (unsigned k = 0; k < a_gene.mTSSs[j].size(); k++)
				{
					int64 pos = a_gene.mTSSs[j][k];
					if (bar_set.find(pos) == bar_set.end())
						bar_set[pos] = 0;
					else if (bar_set[pos] != 0)
						bar_set[pos] = 2;
				}

				// mPASs is indexed with the TSS group index; skip the PAS part
				// instead of reading out of range when the two lists differ in
				// length.
				if (j >= a_gene.mPASs.size()) continue;

				for (unsigned k = 0; k < a_gene.mPASs[j].size(); k++)
				{
					int64 pos = a_gene.mPASs[j][k];
					if (bar_set.find(pos) == bar_set.end())
						bar_set[pos] = 1;
					else if (bar_set[pos] != 1)
						bar_set[pos] = 2;
				}
			}
		}

		curr_bounds.resize(bar_set.size());
		int i = 0;
		for_each_ele_in_group(iter, map_int642int, bar_set)
		{
			Boundary& bound = curr_bounds[i++];
			bound.mPos = iter->first;
			bound.mType = iter->second;
		}

		sort(curr_bounds.begin(), curr_bounds.end());
	}

	cout << "Build all expressed segments" << endl;
	int cnt = 0;
	for_each_ele_in_group(iter, map_str2vec_boundary, boundaries)
	{
		string chr = iter->first;
		bool strand = true;
		// In strand specific mode the last character of the key is the strand
		// mark. An empty key is treated as "not +" instead of throwing.
		if (mbStrandSpecific)
			strand = (!chr.empty() && chr[chr.length()-1] == '+');

		vector<Boundary>& curr_bounds = iter->second;

		// Any previous content for this chromosome is discarded
		vector<Exon>& curr_exons = exons[chr];
		curr_exons.clear();

		// n boundaries delimit n-1 segments. With fewer than two boundaries
		// there is no segment, and size()-1 would wrap around for an empty list.
		if (curr_bounds.size() < 2) continue;

		curr_exons.resize(curr_bounds.size()-1);

		for (unsigned i = 1; i < curr_bounds.size(); i++)
		{
			// Note that every segment could be a part of an exon
			Exon& exon = curr_exons[i-1];
			exon.mStart = curr_bounds[i-1].mPos;
			exon.mEnd = curr_bounds[i].mPos;
			exon.mStartType = curr_bounds[i-1].mType;
			exon.mEndType = curr_bounds[i].mType;
			exon.mChr = chr;
			exon.mStrand = strand;
		}
		cnt += curr_exons.size();
	}
	cout << cnt << " expressed segments have been build" << endl;

	return true;
}		/* -----  end of method DataProcessor::LoadGeneAndExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor
 *       Method:  GroupGeneAndExon
 *  Description:  Colour genes and expressed segments chromosome by chromosome.
 *                Every gene starts with its own index as colour. A segment takes
 *                the colour of the first gene that has not ended before the
 *                segment starts, and every following gene that has already
 *                started at that point is recoloured to the same colour.
 *                Segments lying outside of genes keep colour -1; genes that were
 *                never reached by a segment are set to colour -1 at the end.
 *        Param:  The genes and exons should be sorted according to their start positions
 *                on each chromosome.
 *                gene_color[out], exon_color[out] : one colour per gene / segment
 *                for each chromosome of 'genes'.
 *       Return:
 *--------------------------------------------------------------------------------------
 */
void
DataProcessor::GroupGeneAndExon(map_str2vec_gene& genes, map_str2vec_int& gene_color,
								map_str2vec_exon& exons, map_str2vec_int& exon_color)
{
	cout << "Grouping genes and exons" << endl;

	// Segments on chromosomes without any gene are dropped. The names are
	// collected first so that 'exons' is not modified while it is traversed.
	set<string> chr_without_genes;
	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		if (genes.find(iter->first) == genes.end())
			chr_without_genes.insert(iter->first);
	}
	for_each_ele_in_group(iter, set<string>, chr_without_genes)
	{
		exons.erase(*iter);
	}

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		string chr = iter->first;
		vector<Gene>& chr_genes = iter->second;
		// Creates an empty segment list if the chromosome has none
		vector<Exon>& chr_exons = exons[chr];

		const size_t gene_cnt = chr_genes.size();
		const size_t exon_cnt = chr_exons.size();

		// Initial colours: gene i -> i, every segment -> -1 (outside of genes)
		vector<int>& g_color = gene_color[chr];
		g_color.clear();
		g_color.resize(gene_cnt);
		for (unsigned g = 0; g < gene_cnt; g++)
			g_color[g] = g;

		vector<int>& e_color = exon_color[chr];
		e_color.assign(exon_cnt, -1);

		// seen[g] : gene g has been reached by at least one segment
		vector<bool> seen(gene_cnt, false);

		// The exons should not overlap
		size_t first_open = 0;
		for (unsigned e = 0; e < exon_cnt; e++)
		{
			const Exon& seg = chr_exons[e];

			// Skip the genes that end at or before the start of this segment
			while (first_open < gene_cnt && seg.mStart >= chr_genes[first_open].mEnd)
				first_open++;
			if (first_open == gene_cnt) break;

			seen[first_open] = true;

			// An exon out of genes
			if (seg.mStart < chr_genes[first_open].mStart) continue;

			e_color[e] = g_color[first_open];

			// Following genes that have started already join the same group
			for (size_t next = first_open + 1;
				 next < gene_cnt && seg.mStart >= chr_genes[next].mStart;
				 next++)
			{
				seen[next] = true;
				g_color[next] = g_color[first_open];
			}
		}

		cout << "On chromosome " << chr << endl;

#ifdef DEBUG0
		// Output information
		for (unsigned e = 0; e < e_color.size(); e++)
		{
			if (-1 != e_color[e]) continue;
			cout << "Expressed segement (" << chr_exons[e].mStart << "," 
				 << chr_exons[e].mEnd << ") is out of genes on chromosome " << chr << endl;
		}

		// Report every run of neighbouring genes that share one colour
		for (unsigned last = 0; last < chr_genes.size(); last++)
		{
			int first = last;
			while (last+1 < chr_genes.size() && g_color[last] == g_color[last+1])
				last++;
			if (!seen[last]) continue;
			cout << "Genes : ";
			for (int i = first; i <= last; i++)
				cout << chr_genes[i].mName << ",";
			cout << " are grouped together" << endl;
		}
#endif 

		// Genes that no segment reached are marked with colour -1
		for (unsigned g = 0; g < gene_cnt; g++)
		{
			if (seen[g]) continue;
			g_color[g] = -1;
#ifdef DEBUG0
			cout << "Gene " << chr_genes[g].mName << " has no expressed segments" << endl;
#endif 
		}
	}
}		/* -----  end of method DataProcessor::GroupGeneAndExon  ----- */

/*
 *--------------------------------------------------------------------------------------
 *       Class:  (file-local helper, not a member of DataProcessor)
 *      Method:  LoadTranscripts 
 * Description:  Read transcripts from a tab separated file and group them by
 *               chromosome. Each group is sorted before returning.
 *       Param:  tran_file  :  One transcript per line. The fields read are
 *                   (0-based): 0 name, 1 chromosome, 2 strand,
 *                   5 start_positions, 6 end_positions. Fields 3 and 4 are
 *                   ignored. start_positions or end_positions are seperated
 *                   by commas.
 *               transcripts[out]   :  chromosome -> transcripts. Entries are
 *                   appended to whatever the map already holds.
 *               b_strand_specific  :  If true, the strand field is appended to
 *                   the chromosome name to form the key.
 *      Return:  false if the file can not be opened, true otherwise. Malformed
 *               lines are reported on cerr and skipped.
 *--------------------------------------------------------------------------------------
 */
static
bool
LoadTranscripts(string tran_file, map_str2vec_gene& transcripts, bool b_strand_specific)
{
	// Highest field index read below is 6
	const unsigned MIN_FIELD_CNT = 7;

	fstream infile;
	infile.open(tran_file.data(), ios::in);
	if (!infile.is_open())
	{
		cerr << "File " << tran_file.data() << " can not be opened" << endl;
		return false;
	}

	// read the data and fill the vector
	vector<string> splitted;
	int line_cnt = 0;
	string line;
	while (getline(infile, line))
	{
		line_cnt++;

		// Blank lines (e.g. at the end of the file) carry no transcript
		if (line.empty()) continue;

		splitted = Utility::Split('\t', line);

		// Indexing a short line would read past the end of 'splitted'
		if (splitted.size() < MIN_FIELD_CNT)
		{
			cerr << "DATA ERROR, line " << line_cnt << " : less than " 
				 << MIN_FIELD_CNT << " fields" << endl;
			continue;
		}

		string chr;
		if (b_strand_specific)
			chr = splitted[1]+splitted[2];
		else
			chr = splitted[1];
		vector<string> starts = Utility::Split(',', splitted[5]);
		vector<string> ends = Utility::Split(',', splitted[6]);

		if (starts.size() != ends.size())
		{
			cerr << "DATA ERROR, line " << line_cnt << " : Starts cnt != Ends cnt" << endl;
			continue;
		}

		Gene a_gene;
		a_gene.mName = splitted[0];
		a_gene.mExons.resize(starts.size());
		for (unsigned i = 0; i < starts.size(); i++)
		{
			Exon& exon = a_gene.mExons[i];
			exon.mStart = atoi(starts[i].data());
			exon.mEnd = atoi(ends[i].data());
		}
		a_gene.CalculateRange();

		// operator[] creates the list of a chromosome seen for the first time
		transcripts[chr].push_back(a_gene);
	}

	infile.close();

	for_each_ele_in_group(iter, map_str2vec_gene, transcripts)
		sort(iter->second.begin(), iter->second.end());

	return true;
}		/* -----  end of method LoadTranscripts  ----- */


/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  ExtractJunctionRef
 * Description:  Extract all junctions in given isoforms. A junction is formed by
 *               every two consecutive exons of a transcript. For each distinct
 *               junction, cross_len = read_len - cross_strength bases before the
 *               end of the first exon and cross_len bases from the start of the
 *               second exon are written to mpOutput, preceded by a header line
 *               ">chr|first_pos|second_pos|cross_len|Junc".
 *       Param:  tran_file   : The transcript file, see LoadTranscripts.
 *               refseq_file : Passed to ExtractRef.
 *               start_pos_of_first_nt : The position of the first nucleotide on a chromosome.
 *               read_len, cross_strength : Define cross_len as described above.
 *      Return:  false if the transcripts can not be loaded or ExtractRef fails,
 *               true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractJunctionRef(string tran_file, string& refseq_file, int start_pos_of_first_nt, int read_len, int cross_strength)
{
	// cross_len defines the strength of overlap. More specifically,
	// the junction requires at least 'cross_strength' bases
	// are aligned on both sides. Or, each exon contributes cross_len bases
	// to the junction ref.
	int cross_len = read_len - cross_strength;

	if (2 * cross_len < read_len)
		cerr << "WARNING : read is too short compared to the required cross strengh" << endl;

	cout << "Extracting known junction ref sequences from isoforms." << endl;

	map_str2vec_gene genes;
	if (!LoadTranscripts(tran_file, genes, mbStrandSpecific))
		return false;

	map<string, map_int_str> pos2refseq;

	// Three parallel lists, one entry per junction
	list<int> first_seg, second_seg;
	list<string> chromosome;

	for_each_ele_in_group(iter, map_str2vec_gene, genes)
	{
		string curr_chr = iter->first;

		vector<Gene>& curr_genes = iter->second;

		// Creates the (empty) position map of this chromosome
		map_int_str& a_map = pos2refseq[curr_chr];

		// The set removes junctions shared by several transcripts
		set<int64> junctions;
		for (unsigned i = 0; i < curr_genes.size(); i++)
		{
			vector<Exon>& exons = curr_genes[i].mExons;
			// 'j + 1 < size' instead of 'j < size - 1': the latter wraps around
			// for a transcript without exons.
			for (unsigned j = 0; j + 1 < exons.size(); j++)
			{
				int64 idx = Utility::combine64(exons[j].mEnd, exons[j+1].mStart);
				junctions.insert(idx);
			}
		}

		for_each_ele_in_group(iter, set<int64>, junctions)
		{
			// Request the cross_len bases that end at the end of the first exon
			int end = Utility::get_combined64_first(*iter);
			int pos = end - cross_len;
			first_seg.push_back(pos);
			a_map[pos] = "";

			// ... and the bases beginning at the start of the second exon
			pos = Utility::get_combined64_second(*iter);
			second_seg.push_back(pos);
			a_map[pos] = "";

			chromosome.push_back(curr_chr);
		}
	}

	if (!ExtractRef(refseq_file, pos2refseq, cross_len, start_pos_of_first_nt)) return false;

	// Output the segments
	list<int>::iterator iter_first = first_seg.begin();
	list<int>::iterator iter_second = second_seg.begin();
	list<string>::iterator iter_chr = chromosome.begin();

	cout << "Total " << first_seg.size() << " junctions have been generated. Writing output" << endl;

	/* :WARNING:03/10/2009 03:01:27 PM:feeldead:  */
	// The junction refseq generated in this way is not exactly right
	// When the length of the exon is less than the length of the reads
	// the junction refseq may be impossible to be constructed.
	bool b_out_of_range = false;
	for (; iter_first != first_seg.end(); ++iter_first, ++iter_second, ++iter_chr)
	{
		if (pos2refseq.find(*iter_chr) == pos2refseq.end()) continue;

		map_int_str& a_map = pos2refseq[*iter_chr];

		const string& seq_first = a_map[*iter_first];
		const string& seq_second = a_map[*iter_second];

		// A sequence shorter than cross_len can not be written without
		// reading past its end. Such junctions are skipped; only the first
		// one is reported.
		const bool b_first_short = cross_len > 0 && seq_first.size() < static_cast<size_t>(cross_len);
		const bool b_second_short = cross_len > 0 && seq_second.size() < static_cast<size_t>(cross_len);
		if (b_first_short || b_second_short)
		{
			if (!b_out_of_range)
			{
				cerr << "ERROR : on chromosome " << *iter_chr << ", boundary " 
					 << (b_first_short ? *iter_first : *iter_second) << " is out of range. "
					 << "Similar errors will be suppressed" << endl;
				b_out_of_range = true;
			}
			continue;
		}

		// Output in the form which is compatible with LoadJunctionReads
		string chr = *iter_chr;
		if (mbStrandSpecific)
		{
			// Remove the strand mark and the end.
			chr = chr.substr(0, chr.length()-1);
		}
		(*mpOutput) << ">" << chr << "|" << *iter_first << "|" << *iter_second << "|" << cross_len << "|Junc" << endl;
		// Becareful about the start index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_first[i];
		// Becareful about the end index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_second[i];
		(*mpOutput) << endl;
	}

	cout << "Done" << endl;

	return true;
}		/* -----  end of method DataProcessor::ExtractJunctionRef  ----- */


/*
 *--------------------------------------------------------------------------------------
 *       Class:  DataProcessor
 *      Method:  ExtractJunctionRef
 * Description:  Given grouped exons and the refseq, this method extract the junction
 *               refseq. Within every run of consecutive exons sharing one colour,
 *               each ordered pair of exons is a junction candidate. A pair is
 *               dropped if the two exons are adjacent, if the first exon ends with
 *               a type 0 boundary, if the second exon starts with a type 1
 *               boundary, or if no gene of that colour contains both exons.
 *               For every remaining pair cross_len = read_len - cross_strength
 *               bases of each side are written to mpOutput, preceded by a header
 *               line ">chr|first_pos|second_pos|cross_len|Junc".
 *       Param:  exons, exon_color, genes, gene_color : As produced by
 *                   GroupGeneAndExon.
 *               refseq_file : Passed to ExtractRef.
 *               start_pos_of_first_nt : The position of the first nucleotide on a chromosome.
 *               read_len, cross_strength : Define cross_len as described above.
 *      Return:  false if ExtractRef fails, true otherwise.
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractJunctionRef(map_str2vec_exon& exons, map_str2vec_int& exon_color, 
								  map_str2vec_gene& genes, map_str2vec_int& gene_color, 
								  string& refseq_file, int start_pos_of_first_nt,
								  int read_len, int cross_strength)
{
	// cross_len defines the strength of overlap. More specifically,
	// the junction requires at least 'cross_strength' bases
	// are aligned on both sides. Or, each exon contributes cross_len bases
	// to the junction ref.
	int cross_len = read_len - cross_strength;

	if (2 * cross_len < read_len)
		cerr << "WARNING : read is too short compared to the required cross strengh" << endl;

	cout << "Extracting junction ref sequences." << endl;

	map<string, map_int_str> pos2refseq;

	// Three parallel lists, one entry per junction
	list<int> first_seg, second_seg;
	list<string> chromosome;

	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		string curr_chr = iter->first;

		vector<Exon>& curr_exons = iter->second;
		vector<int>& curr_exon_color = exon_color[curr_chr];

		vector<Gene>& curr_genes = genes[curr_chr];
		vector<int>& curr_gene_color = gene_color[curr_chr];

		// For each color, find the start gene idx and end gene idx
		vector<int> start_gene_idx_indexed_by_color;
		vector<int> end_gene_idx_indexed_by_color;
		start_gene_idx_indexed_by_color.assign(curr_gene_color.size(), curr_gene_color.size());
		end_gene_idx_indexed_by_color.assign(curr_gene_color.size(), 0);
		for (unsigned gene_idx = 0; gene_idx < curr_gene_color.size(); gene_idx++)
		{
			if (-1 == curr_gene_color[gene_idx]) continue;
			if (start_gene_idx_indexed_by_color[curr_gene_color[gene_idx]] > gene_idx)
				start_gene_idx_indexed_by_color[curr_gene_color[gene_idx]] = gene_idx;
			if (end_gene_idx_indexed_by_color[curr_gene_color[gene_idx]] < gene_idx)
				end_gene_idx_indexed_by_color[curr_gene_color[gene_idx]] = gene_idx;
		}

		for (unsigned exon_idx = 0; exon_idx < curr_exons.size(); exon_idx++)
		{
			// [old_idx, exon_idx] becomes the run of exons with the same colour
			unsigned old_idx = exon_idx;
			while (exon_idx+1 < curr_exons.size() && 
					curr_exon_color[exon_idx] == curr_exon_color[exon_idx+1])
				exon_idx++;

			if (-1 == curr_exon_color[exon_idx]) continue;

			int start_gene_idx = start_gene_idx_indexed_by_color[curr_exon_color[exon_idx]];
			int end_gene_idx = end_gene_idx_indexed_by_color[curr_exon_color[exon_idx]];

			// Creates the position map of this chromosome on first use
			map_int_str& a_map = pos2refseq[curr_chr];

			// Any two different exons in the involved exons may form a junction.
			for (unsigned first_idx = old_idx; first_idx <= exon_idx ; first_idx++)
			{
				for (unsigned second_idx = first_idx + 1; second_idx <= exon_idx ; second_idx++)
				{
					if (curr_exons[first_idx].mEnd == curr_exons[second_idx].mStart ||
					    (0 == curr_exons[first_idx].mEndType || 1 == curr_exons[second_idx].mStartType))
						continue;

					// if curr two exons do not belong to the same gene, skip
					bool b_succ = false;
					for (int gene_idx = start_gene_idx; gene_idx <= end_gene_idx; gene_idx++)
					{
						if (curr_exons[first_idx].mStart >= curr_genes[gene_idx].mStart &&
						    curr_exons[first_idx].mEnd <= curr_genes[gene_idx].mEnd &&
						    curr_exons[second_idx].mStart >= curr_genes[gene_idx].mStart &&
						    curr_exons[second_idx].mEnd <= curr_genes[gene_idx].mEnd)
						{
							b_succ = true;
							break;
						}
					}

					if (!b_succ) continue;

					// Request the cross_len bases ending at the end of the first exon
					int pos = curr_exons[first_idx].mEnd-cross_len;
					first_seg.push_back(pos);
					a_map[pos] = "";

					// ... and the bases beginning at the start of the second exon
					pos = curr_exons[second_idx].mStart;
					second_seg.push_back(pos);
					a_map[pos] = "";

					chromosome.push_back(curr_chr);
				}
			}
		}
	}

	if (!ExtractRef(refseq_file, pos2refseq, cross_len, start_pos_of_first_nt)) return false;

	// Output the segments
	list<int>::iterator iter_first = first_seg.begin();
	list<int>::iterator iter_second = second_seg.begin();
	list<string>::iterator iter_chr = chromosome.begin();

	cout << "Total " << first_seg.size() << " junctions have been generated. Writing output" << endl;

	/* :WARNING:03/10/2009 03:01:27 PM:feeldead:  */
	// The junction refseq generated in this way is not exactly right
	// When the length of the exon is less than the length of the reads
	// the junction refseq may be impossible to be constructed.
	bool b_out_of_range = false;
	// The three iterators advance in the loop header so that 'continue' can
	// never leave them behind.
	for (; iter_first != first_seg.end(); ++iter_first, ++iter_second, ++iter_chr)
	{
		if (pos2refseq.find(*iter_chr) == pos2refseq.end()) continue;

		map_int_str& a_map = pos2refseq[*iter_chr];

		const string& seq_first = a_map[*iter_first];
		const string& seq_second = a_map[*iter_second];

		// A sequence shorter than cross_len can not be written without
		// reading past its end. Every such junction is skipped; the flag only
		// suppresses the repeated error message, not the skipping.
		const bool b_first_short = cross_len > 0 && seq_first.size() < static_cast<size_t>(cross_len);
		const bool b_second_short = cross_len > 0 && seq_second.size() < static_cast<size_t>(cross_len);
		if (b_first_short || b_second_short)
		{
			if (!b_out_of_range)
			{
				cerr << "ERROR : on chromosome " << *iter_chr << ", boundary " 
					 << (b_first_short ? *iter_first : *iter_second) << " is out of range. "
					 << "Similar errors will be suppressed" << endl;
				b_out_of_range = true;
			}
			continue;
		}

		// Output in the form which is compatible with LoadJunctionReads
		string chr = *iter_chr;
		if (mbStrandSpecific)
		{
			// Remove the strand mark and the end.
			chr = chr.substr(0, chr.length()-1);
		}
		(*mpOutput) << ">" << chr << "|" << *iter_first << "|" << *iter_second << "|" << cross_len << "|Junc" << endl;
		// Becareful about the start index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_first[i];
		// Becareful about the end index here
		for (int i = 0; i < cross_len; i++)
			(*mpOutput) << seq_second[i];
		(*mpOutput) << endl;
	}

	cout << "Done" << endl;

	return true;
}		/* -----  end of method DataProcessor::ExtractJunctionRef  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor 
 *       Method:  ExtractInstances
 *  Description:  Extract instances from given grouped genes and exons. Every run of
 *                consecutive exons that share one colour (other than -1) becomes
 *                one instance holding those exons, their lengths, the genes of
 *                the same colour, the start/end exon sets derived from the TSSs
 *                and PASs of those genes and, if the genes carry exons, one known
 *                isoform per gene. Please see Help() for more information.
 *        Param:  genes, gene_color, exons, exon_color : As produced by
 *                    GroupGeneAndExon.
 *                all_instances[out] : Resized to the number of instances found;
 *                    instance i is stored at index i-1 with mInstanceCnt = i.
 *       Return:  Always true
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::ExtractInstances(map_str2vec_gene& genes, map_str2vec_int& gene_color,
								map_str2vec_exon& exons, map_str2vec_int& exon_color,
								vector<Instance>& all_instances)
{
	int prob_cnt = 0;

	for_each_ele_in_group(iter, map_str2vec_exon, exons)
	{
		string curr_chr = iter->first;
		vector<Exon>& curr_exons = iter->second;
		vector<int>& curr_exon_color = exon_color[curr_chr];

		if (genes.find(curr_chr) == genes.end()) continue;

		vector<Gene>& curr_genes = genes[curr_chr];
		vector<int>& curr_gene_color = gene_color[curr_chr];

		if (curr_genes.size() == 0) continue;

		// Carried over from one exon run to the next: genes are only scanned
		// forwards.
		int gene_idx = 0;
		const int gene_cnt = curr_gene_color.size();

		for (unsigned exon_idx = 0; exon_idx < curr_exons.size(); exon_idx++)
		{
			// [old_exon_idx, exon_idx] becomes the run of exons with one colour
			unsigned old_exon_idx = exon_idx;
			while (exon_idx+1 < curr_exons.size() && 
					curr_exon_color[exon_idx] == curr_exon_color[exon_idx+1])
				exon_idx++;
			
			if (-1 == curr_exon_color[exon_idx]) continue;

			prob_cnt++;
			all_instances.resize(prob_cnt);
			Instance& an_instance = all_instances[prob_cnt-1];

			an_instance.mInstanceCnt = prob_cnt;
			an_instance.mKnownCnt = 0;

			vector<Exon>& exons_of_this_inst = an_instance.mExons;
			for (unsigned i = old_exon_idx; i <= exon_idx; i++)
				exons_of_this_inst.push_back(curr_exons[i]);

			an_instance.mSegLen.resize(exons_of_this_inst.size());
			for (unsigned i = 0; i < exons_of_this_inst.size(); ++i)
				an_instance.mSegLen[i] = exons_of_this_inst[i].mEnd - exons_of_this_inst[i].mStart;

			// Find the genes [old_gene_idx, gene_idx) that carry the colour of
			// this run. The first scan is bounded so that a colour without any
			// gene can not run past the end of curr_gene_color.
			while (gene_idx < gene_cnt &&
					curr_gene_color[gene_idx] < curr_exon_color[exon_idx]) gene_idx++;
			int old_gene_idx = gene_idx;
			while (gene_idx < gene_cnt && 
					curr_gene_color[gene_idx] == curr_exon_color[exon_idx]) gene_idx++;
			assert(old_gene_idx < gene_idx);

			vector<Gene>& genes_of_this_inst = an_instance.mGenes;
			genes_of_this_inst.resize(gene_idx - old_gene_idx);
			for (unsigned i = 0; i < genes_of_this_inst.size(); i++)
				genes_of_this_inst[i] = curr_genes[i+old_gene_idx];

			// Find start and end exons
			vector<vector<int> >& all_start_exons = an_instance.mStartExons;
			vector<vector<int> >& all_end_exons = an_instance.mEndExons;
			for (int i = old_gene_idx; i < gene_idx; i++)
			{
				vector<vector<int64> >& TSSs = curr_genes[i].mTSSs;
				vector<vector<int64> >& PASs = curr_genes[i].mPASs;

				for (unsigned j = 0; j < TSSs.size(); j++)
				{
					vector<int> start_exons;
					vector<int> end_exons;
					for (unsigned k = 0; k < TSSs[j].size(); k++)
					{
						// Kept as int64: narrowing to int could make a large
						// position match the wrong exon.
						const int64 tss = TSSs[j][k];
						// Find the corresponding exon
						unsigned start = old_exon_idx;
						for (; start <= exon_idx; start++)
							if (tss == curr_exons[start].mStart) break;
						if (start <= exon_idx)
							start_exons.push_back(start - old_exon_idx);
					}
					// PASs is indexed with the TSS group index; treat a missing
					// PAS group as empty instead of reading out of range.
					for (unsigned k = 0; j < PASs.size() && k < PASs[j].size(); k++)
					{
						const int64 pas = PASs[j][k];
						// Find the corresponding exon
						unsigned end = old_exon_idx;
						for (; end <= exon_idx; end ++)
							if (pas == curr_exons[end].mEnd) break;
						if (end <= exon_idx)
							end_exons.push_back(end - old_exon_idx);
					}

					// Keep the pair only if both sides are non-empty and the
					// same pair has not been recorded yet
					if (start_exons.size() > 0 && end_exons.size() > 0)
					{
						bool b_exist = false;
						for (unsigned k = 0; k < all_start_exons.size(); k++)
							if (start_exons == all_start_exons[k] && end_exons == all_end_exons[k])
							{
								b_exist = true;
								break;
							}
						if (!b_exist)
						{
							all_start_exons.push_back(start_exons);
							all_end_exons.push_back(end_exons);
						}
					}
				}
			}

			// Construct known isoforms
			vector<vector<bool> >& isoforms = an_instance.mIsoforms;

			// Only the first gene of the group is inspected to decide whether
			// exons have been provided. The index check matters only when the
			// assert above is compiled out.
			bool b_exons_provided = false;
			if (old_gene_idx < gene_idx && curr_genes[old_gene_idx].mExons.size() > 0)
				b_exons_provided = true;

			if (b_exons_provided)
			{
				isoforms.resize(gene_idx - old_gene_idx);
				for (unsigned i = 0; i < isoforms.size(); i++)
				{
					Gene& a_gene = curr_genes[i+old_gene_idx];
					vector<bool> an_iso;
					an_iso.assign(exons_of_this_inst.size(), false);
					for (unsigned j = 0; j < an_iso.size(); j++)
					{
						// Segment j is part of the isoform if one exon of the
						// gene covers it completely
						bool b_exist = false;
						for (unsigned k = 0; k < a_gene.mExons.size(); k++)
							if (a_gene.mExons[k].mStart <= exons_of_this_inst[j].mStart &&
								a_gene.mExons[k].mEnd >= exons_of_this_inst[j].mEnd)
							{
								b_exist = true;
								break;
							}
						an_iso[j] = b_exist;
					}
					isoforms[i] = an_iso;
				}
				an_instance.mKnownCnt = isoforms.size();
				an_instance.mIsoExp.resize(isoforms.size(), 0);
			}
		}
	}

	return true;
}		/* -----  end of method DataProcessor::ExtractInstances  ----- */


/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor 
 *       Method:  RemoveIntrons
 *  Description:  Decide which segments of an instance are introns, drop them and
 *                renumber everything that refers to segments by index (start/end
 *                exon sets, segment lengths, known isoforms, short reads).
 *        Param:  an_instance          :  Modified in place.
 *                b_use_provided_exons :  If true and the instance has known
 *                    isoforms, a segment is kept when some known isoform covers
 *                    it. Otherwise a segment is kept when its expression level
 *                    is above noise_level.
 *                noise_level          :  Expression threshold (in RPKM).
 *                Start/end segments are always kept.
 *       Return:  The number of expressed segments
 *--------------------------------------------------------------------------------------
 */
int
DataProcessor::RemoveIntrons(Instance& an_instance, bool b_use_provided_exons, double noise_level)
{
	vector<vector<bool> >& known_isoforms = an_instance.mIsoforms;
	vector<Exon>& segments = an_instance.mExons;
	vector<vector<int> >& start_sets = an_instance.mStartExons;
	vector<vector<int> >& end_sets = an_instance.mEndExons;

	const size_t old_size = segments.size();

	// Everything is an intron until shown otherwise. No start/end segments
	// could be introns.
	vector<bool> is_intron(old_size, true);
	for (unsigned s = 0; s < start_sets.size(); s++)
	{
		const vector<int>& starts = start_sets[s];
		const vector<int>& ends = end_sets[s];
		for (unsigned k = 0; k < starts.size(); k++)
			is_intron[starts[k]] = false;
		for (unsigned k = 0; k < ends.size(); k++)
			is_intron[ends[k]] = false;
	}

	const bool b_by_isoforms = known_isoforms.size() > 0 && b_use_provided_exons;
	if (!b_by_isoforms)
	{
		// Segments expressed above the noise level are not introns
		for (unsigned seg = 0; seg < an_instance.mSegLen.size(); seg++)
		{
			// In RPKM
			const double level = an_instance.mShortReadGroup.SegExp(an_instance.mSegLen[seg], seg);
			if (level > noise_level)
				is_intron[seg] = false;
		}
	}
	else
	{
		// Segments covered by any known isoform are not introns
		for (unsigned iso = 0; iso < known_isoforms.size(); iso++)
		{
			const vector<bool>& covered = known_isoforms[iso];
			for (unsigned seg = 0; seg < covered.size(); seg++)
			{
				if (covered[seg])
					is_intron[seg] = false;
			}
		}
	}

	// Index translation tables between the old and the compacted numbering
	vector<int> kept2orig;
	vector<int> orig2kept(old_size, -1);
	kept2orig.reserve(old_size);
	for (unsigned seg = 0; seg < old_size; seg++)
	{
		if (is_intron[seg])
		{
#ifdef DEBUG0
			cout << "Intron :" << segments[seg].mStart << "," 
				 << segments[seg].mEnd << " is removed" << endl;
#endif
			continue;
		}
		orig2kept[seg] = kept2orig.size();
		kept2orig.push_back(seg);
	}
	const int new_size = kept2orig.size();

	// Nothing has been removed: the instance stays untouched
	if (static_cast<size_t>(new_size) == old_size)
		return new_size;

	// Compact the segments. kept2orig[i] >= i, so moving forwards is safe.
	for (int i = 0; i < new_size; i++)
		segments[i] = segments[kept2orig[i]];
	segments.resize(new_size);

	// Renumber the start/end exon sets
	for (unsigned s = 0; s < start_sets.size(); s++)
	{
		vector<int>& starts = start_sets[s];
		vector<int>& ends = end_sets[s];
		for (unsigned k = 0; k < starts.size(); k++)
		{
			// NOTE: it is possible that some start/end exon has been defined as intron, 
			// i.e., orig2kept[?] = -1
			const int mapped = orig2kept[starts[k]];
			if (-1 != mapped)
				starts[k] = mapped;
		}
		for (unsigned k = 0; k < ends.size(); k++)
		{
			const int mapped = orig2kept[ends[k]];
			if (-1 != mapped)
				ends[k] = mapped;
		}
		// Here, it is possible that start_sets[s] or end_sets[s] is empty
	}

	// Recompute the segment lengths
	an_instance.mSegLen.resize(segments.size());
	for (unsigned seg = 0; seg < segments.size(); ++seg)
		an_instance.mSegLen[seg] = segments[seg].mEnd - segments[seg].mStart;

	// Shrink the known isoforms
	for (unsigned iso = 0; iso < known_isoforms.size(); iso++)
	{
		vector<bool>& an_iso = known_isoforms[iso];

		// Check whether the intron removing is consistent with known isoforms
		bool b_consistent = true;
		for (unsigned seg = 0; seg < an_iso.size() && b_consistent; seg++)
		{
			if (an_iso[seg] && is_intron[seg])
			{
				cerr << "Intron removing is inconsistent with known isoform " << an_instance.mGenes[iso].mName << endl;
				b_consistent = false;
			}
		}

		// An inconsistent isoform is left empty such that no prediction will
		// matched to it.
		vector<bool> shrunk(new_size, false);
		if (b_consistent)
		{
			for (int j = 0; j < new_size; j++)
				shrunk[j] = an_iso[kept2orig[j]];
		}
		an_iso = shrunk;
	}

	// Modify the patterns of short reads
	vector<ShortRead>& reads = an_instance.mShortReadGroup.mShortReads;
	for (unsigned r = 0; r < reads.size(); ++r)
		reads[r].Shrink(orig2kept);

	return new_size;
}		/* -----  end of method DataProcessor::RemoveIntrons  ----- */

/*
 *--------------------------------------------------------------------------------------
 *        Class:  DataProcessor 
 *       Method:  HighIsoforms
 *  Description:  Given a set of isoforms and mapping information, find out highly expressed
 *                isoforms.
 *                NOTE: the implementation is currently disabled (commented out
 *                below). The method does nothing and always succeeds.
 *        Param:  an_instance        :  Currently unused
 *                b_check_start_end  :  Currently unused
 *       Return:  Always true
 *--------------------------------------------------------------------------------------
 */
bool
DataProcessor::HighIsoforms(Instance& an_instance, bool b_check_start_end)
{
	// Both parameters are only referenced by the disabled code below
	(void) an_instance;
	(void) b_check_start_end;

//	vector<Exon>& exons_of_this_inst = an_instance.mExons;
//
//	for (unsigned i = 0; i < an_instance.mIsoforms.size(); ++i)
//	{
//		bool b_high = true;
//		double min_exp = 100000;
//
//		vector<bool>& an_iso = an_instance.mIsoforms[i];
//
//		int start = -1;
//		double read_cnt;
//		int length;
//		for (unsigned j = 0; j < an_iso.size(); j++)
//		{
//			if (an_iso[j])
//			{
//				if (-1 != start)
//				{
//					// Check junction
//					if (exons_of_this_inst[start].mEnd != exons_of_this_inst[j].mStart)
//					{
//						double exp = an_instance.JuncExp(start, j);
//						
//						if ( exp <= 0)
//						{
//							b_high = false;
//							break;
//						}
//						else if (min_exp > exp) min_exp = exp;
//						
//					}
//					else
//					{
//						read_cnt += an_instance.ReadCnt(start);
//						length += an_instance.mSegLen(start);
//					}
//				}
//				start = j;
//			}
//		}
//
//		if (b_high)
//		{
//			int start_idx = 0;
//			int end_idx = orig_exons.size();
//			if (!b_check_start_end)
//			{
//				start_idx++;
//				end_idx--;
//			}
//			// Does not check the first and the last exon
//			for (int j = start_idx; j < end_idx; j++)
//			{
//				Exon& an_exon = orig_exons[j];
//
//				// Check expression level
//				int read_cnt = 0;
//				int length = 0;
//				for (unsigned k = 0; k < exons_of_this_inst.size(); k++)
//					if (an_exon.mStart <= exons_of_this_inst[k].mStart &&
//						an_exon.mEnd >= exons_of_this_inst[k].mEnd)
//					{
//						read_cnt += exons_of_this_inst[k].mStartCnt;
//						length += exons_of_this_inst[k].mEnd - exons_of_this_inst[k].mStart;
//						length -= mCrossStrength - 1;
//					}
//
//				if (length < 0) continue;
//
//				double exp = read_cnt * 1000000.0 / all_single_end_read_cnt * 1000.0 / length;
//
//				if (min_exp > exp)
//					min_exp = exp;
//			}
//
//			(*mpOutput) << min_exp << "\t";
//			curr_genes[i+old_gene_idx].Write(mpOutput);
//		}
//	}
	return true;
}		/* -----  end of method DataProcessor::HighIsoforms----- */

