/*
 * =====================================================================================
 *
 *       Filename:  InstanceReader.h
 *
 *    Description:  The header file for class InstanceReader
 *
 *        Version:  1.0
 *        Created:  04/09/2009 01:21:42 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
 *        Company:  THU
 *
 * =====================================================================================
 */

#ifndef InstanceReader_H 
#define InstanceReader_H

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include "InstanceHandler.h"
#include "InstanceGenerator.h"

using namespace std;

/*
 * =====================================================================================
 *        Class:  InstanceReader
 *  Description:  Read instances from file
 * =====================================================================================
 */
class InstanceReader : public InstanceGenerator
{
	public:
		/* ====================  LIFECYCLE     ======================================= */
		InstanceReader (string input_file)
		{
			mInputFile = input_file;
		};                             /* constructor */

		virtual ~InstanceReader (){};                             /* constructor */

		virtual
		bool
		Generate()
		{
			fstream infile;
			infile.open(mInputFile.data(), ios::in);
			if (!infile.is_open())
			{
				cerr << "File " << mInputFile.data() << " can not be opened" << endl;
				return false;
			}

			vector<string> splitted;
			int line_cnt = 0;
			int loaded_cnt = 0;
			string line;
			getline(infile, line);
			while (true)
			{
				line_cnt++;
				Instance an_instance;

				int& instance_cnt                      = an_instance.mInstanceCnt;
				vector<int>& set_sizes                 = an_instance.mSetSizes;
				vector<double>& sample_cnt                = an_instance.mSampleCnt;
				vector<vector<bool> >& isoforms        = an_instance.mIsoforms;
				vector<vector<double> >& splice_read_cnt  = an_instance.mSpliceReadCnt;
				vector<vector<double> >& splice_read_cnt_pe  = an_instance.mSpliceReadCntPE;
				vector<double>& iso_exp                = an_instance.mIsoExp;
				vector<double>& fit_value              = an_instance.mFitValue;
				vector<Exon>& exons                    = an_instance.mExons;
				vector<Gene>& genes                    = an_instance.mGenes;
				int& known_cnt                         = an_instance.mKnownCnt;
				int& read_len                          = an_instance.mReadLen;
				int& read_cnt                          = an_instance.mReadCnt;
				int& cross_strength                    = an_instance.mCrossStrength;
				double& noise_level                    = an_instance.mNoiseLevel;

				an_instance.mPvalue = 1;

				splitted = Utility::Split('\t', line);
				if (splitted[0] != "Instance")
				{
					cerr << "ERROR : line " << line_cnt << " 'Instance' is expected" << endl;
					cerr << line << endl;
				}

				instance_cnt = atoi(splitted[1].data());

				int set_cnt = atoi(splitted[2].data());
				int iso_cnt = atoi(splitted[3].data());
				known_cnt = atoi(splitted[4].data());

				string comments;
				for (int i = 5; i < splitted.size(); i++)
					comments += splitted[i];
				set_sizes.resize(set_cnt);
				sample_cnt.resize(set_cnt);
				isoforms.resize(iso_cnt);
				iso_exp.resize(iso_cnt);
				fit_value.resize(set_cnt);

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				read_len = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				read_cnt = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				cross_strength = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				noise_level = atof(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				if ("genes     :" != splitted[0])
					cerr << "ERROR : 'genes     :' are expected at line " << line_cnt << endl;
				genes.resize(atoi(splitted[1].data()));
				for (unsigned i = 0; i < genes.size(); i++)
				{
					getline(infile, line); line_cnt++;
					genes[i].Read(line);
				}

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				if ("exp segs  :" != splitted[0])
					cerr << "ERROR : 'exp segs  :' are expected at line " << line_cnt << endl;
				exons.resize(set_cnt);
				for (int i = 0; i < set_cnt; i++)
				{
					getline(infile, line); line_cnt++;
					exons[i].Read(line);
				}

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);

				if ("fit value :" == splitted[0])
				{
					// sample_cnt
					for (int i = 0; i < set_cnt; i++)
						fit_value[i] = atof(splitted[i+1].data());
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
				}

				// sample_cnt
				for (int i = 0; i < set_cnt; i++)
					sample_cnt[i] = atof(splitted[i+1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				for (int i = 0; i < set_cnt; i++)
					set_sizes[i] = atoi(splitted[i+1].data());

				for (int i = 0; i < iso_cnt; i++)
				{
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					isoforms[i].resize(set_cnt);
					for (int j = 0; j < set_cnt; j++)
					{
						if (atoi(splitted[j+1].data()) == 0)
							isoforms[i][j] = false;
						else
							isoforms[i][j] = true;
					}
					iso_exp[i] = atof(splitted[set_cnt+1].data());
				}

				splice_read_cnt.resize(set_cnt);
				for (int i = 0; i < set_cnt; i++)
				{
					splice_read_cnt[i].resize(set_cnt);
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					for (int j = 0; j < set_cnt; j++)
						splice_read_cnt[i][j] = atof(splitted[j+1].data());
				}

				bool b_more = getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);

				// Load PE information, optional
				bool b_pe_info = false;
				while (b_more && "PE info:" == splitted[0])
				{
					an_instance.mPEInfo.resize(an_instance.mPEInfo.size()+1);
					PEInfo& pe_info = an_instance.mPEInfo[an_instance.mPEInfo.size()-1];
					pe_info.mReadLen = atoi(splitted[1].data());
					pe_info.mSpanMean = atof(splitted[2].data());
					pe_info.mSpanStd = atof(splitted[3].data());
					pe_info.mReadCnt = atoi(splitted[4].data());

					// Allows three standard deviations. About 99.7%
					pe_info.mSpanLow = (int)(pe_info.mSpanMean - 3 * pe_info.mSpanStd);
					pe_info.mSpanHigh = (int)(pe_info.mSpanMean + 3 * pe_info.mSpanStd);

					int part_comb_cnt = atoi(splitted[5].data());
					pe_info.mPartComb.resize(part_comb_cnt);
					pe_info.mPartCombDup.resize(part_comb_cnt);
					for (int i = 0; i < part_comb_cnt; i++)
					{
						pe_info.mPartComb[i].resize(set_cnt);
						getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);
						for (int j = 0; j < set_cnt; j++)
						{
							if (atoi(splitted[j].data()) == 0)
								pe_info.mPartComb[i][j] = false;
							else
								pe_info.mPartComb[i][j] = true;
						}
						pe_info.mPartCombDup[i] = atoi(splitted[set_cnt].data());
					}
					b_more = getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					b_pe_info = true;
				}

				if (b_pe_info)
				{
					splice_read_cnt_pe.resize(set_cnt);
					for (int i = 0; i < set_cnt; i++)
					{
						splice_read_cnt_pe[i].resize(set_cnt);
						for (int j = 0; j < set_cnt; j++)
							splice_read_cnt_pe[i][j] = atof(splitted[j+1].data());

						b_more = getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);
					}
				}

				// Load start end exons, optional
				if (b_more && "SE exons:" == splitted[0])
				{
					int se_pair_cnt = atoi(splitted[1].data());
					an_instance.mStartExons.resize(se_pair_cnt);
					an_instance.mEndExons.resize(se_pair_cnt);
					for (int i = 0; i < se_pair_cnt; i++)
					{
						getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);

						vector<string> indexes = Utility::Split(',', splitted[0]);
						for (unsigned j = 0; j < indexes.size(); j++)
							an_instance.mStartExons[i].push_back(atoi(indexes[j].data()));

						indexes = Utility::Split(',', splitted[1]);
						for (unsigned j = 0; j < indexes.size(); j++)
							an_instance.mEndExons[i].push_back(atoi(indexes[j].data()));
					}
					b_more = getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
				}

				loaded_cnt++;
				if (mpHandler) mpHandler->OnInstance(an_instance);
				if (!b_more) break;
			}

			cout << "Totally, " << loaded_cnt << " instances are loaded" << endl;
			infile.close();
		};


		//--------------------------------------------------------------------------------------
		//       Class:  InstanceReader
		//      Method:  ReadAllInstances
		// Description:  This function read all the instances into an array from a file.
		//               It is very similar to Generate.
		//--------------------------------------------------------------------------------------
		bool
		ReadAllInstances(vector<Instance>& all_instances)
		{
			fstream infile;
			infile.open(mInputFile.data(), ios::in);
			if (!infile.is_open())
			{
				cerr << "File " << mInputFile.data() << " can not be opened" << endl;
				return false;
			}

			vector<string> splitted;
			int line_cnt = 0;
			int loaded_cnt = 0;
			string line;
			getline(infile, line);
			while (true)
			{
				line_cnt++;
				all_instances.resize(all_instances.size()+1);
				Instance& an_instance = all_instances[all_instances.size()-1];

				int& instance_cnt                      = an_instance.mInstanceCnt;
				vector<int>& set_sizes                 = an_instance.mSetSizes;
				vector<double>& sample_cnt                = an_instance.mSampleCnt;
				vector<vector<bool> >& isoforms        = an_instance.mIsoforms;
				vector<vector<double> >& splice_read_cnt  = an_instance.mSpliceReadCnt;
				vector<vector<double> >& splice_read_cnt_pe  = an_instance.mSpliceReadCntPE;
				vector<double>& iso_exp                = an_instance.mIsoExp;
				vector<double>& fit_value              = an_instance.mFitValue;
				vector<Exon>& exons                    = an_instance.mExons;
				vector<Gene>& genes                    = an_instance.mGenes;
				int& known_cnt                         = an_instance.mKnownCnt;
				int& read_len                          = an_instance.mReadLen;
				int& read_cnt                          = an_instance.mReadCnt;
				int& cross_strength                    = an_instance.mCrossStrength;
				double& noise_level                    = an_instance.mNoiseLevel;

				an_instance.mPvalue = 1;

				splitted = Utility::Split('\t', line);
				if (splitted[0] != "Instance")
				{
					cerr << "ERROR : line " << line_cnt << " 'Instance' is expected" << endl;
					cerr << line << endl;
				}

				instance_cnt = atoi(splitted[1].data());

				int set_cnt = atoi(splitted[2].data());
				int iso_cnt = atoi(splitted[3].data());
				known_cnt = atoi(splitted[4].data());

				string comments;
				for (int i = 5; i < splitted.size(); i++)
					comments += splitted[i];
				set_sizes.resize(set_cnt);
				sample_cnt.resize(set_cnt);
				isoforms.resize(iso_cnt);
				iso_exp.resize(iso_cnt);
				fit_value.resize(set_cnt);

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				read_len = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				read_cnt = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				cross_strength = atoi(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				noise_level = atof(splitted[1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				if ("genes     :" != splitted[0])
					cerr << "ERROR : 'genes     :' are expected at line " << line_cnt << endl;
				genes.resize(atoi(splitted[1].data()));
				for (unsigned i = 0; i < genes.size(); i++)
				{
					getline(infile, line); line_cnt++;
					genes[i].Read(line);
				}

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				if ("exp segs  :" != splitted[0])
					cerr << "ERROR : 'exp segs  :' are expected at line " << line_cnt << endl;

				exons.resize(set_cnt);
				for (int i = 0; i < set_cnt; i++)
				{
					getline(infile, line); line_cnt++;
					exons[i].Read(line);
				}

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);

				if ("fit value :" == splitted[0])
				{
					// sample_cnt
					for (int i = 0; i < set_cnt; i++)
						fit_value[i] = atof(splitted[i+1].data());
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
				}

				// sample_cnt
				for (int i = 0; i < set_cnt; i++)
					sample_cnt[i] = atof(splitted[i+1].data());

				getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);
				for (int i = 0; i < set_cnt; i++)
					set_sizes[i] = atoi(splitted[i+1].data());

				for (int i = 0; i < iso_cnt; i++)
				{
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					isoforms[i].resize(set_cnt);
					for (int j = 0; j < set_cnt; j++)
					{
						if (atoi(splitted[j+1].data()) == 0)
							isoforms[i][j] = false;
						else
							isoforms[i][j] = true;
					}
					iso_exp[i] = atof(splitted[set_cnt+1].data());
				}

				splice_read_cnt.resize(set_cnt);
				for (int i = 0; i < set_cnt; i++)
				{
					splice_read_cnt[i].resize(set_cnt);
					getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					for (int j = 0; j < set_cnt; j++)
						splice_read_cnt[i][j] = atof(splitted[j+1].data());
				}

				bool b_more = getline(infile, line); line_cnt++;
				splitted = Utility::Split('\t', line);

				// Load PE information, optional
				bool b_pe_info = false;
				while (b_more && "PE info:" == splitted[0])
				{
					an_instance.mPEInfo.resize(an_instance.mPEInfo.size()+1);
					PEInfo& pe_info = an_instance.mPEInfo[an_instance.mPEInfo.size()-1];
					pe_info.mReadLen = atoi(splitted[1].data());
					pe_info.mSpanMean = atof(splitted[2].data());
					pe_info.mSpanStd = atof(splitted[3].data());
					pe_info.mReadCnt = atoi(splitted[4].data());

					// Allows three standard deviations. About 99.7%
					pe_info.mSpanLow = (int)(pe_info.mSpanMean - 3 * pe_info.mSpanStd);
					pe_info.mSpanHigh = (int)(pe_info.mSpanMean + 3 * pe_info.mSpanStd);

					int part_comb_cnt = atoi(splitted[5].data());
					pe_info.mPartComb.resize(part_comb_cnt);
					pe_info.mPartCombDup.resize(part_comb_cnt);
					for (int i = 0; i < part_comb_cnt; i++)
					{
						pe_info.mPartComb[i].resize(set_cnt);
						getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);
						for (int j = 0; j < set_cnt; j++)
						{
							if (atoi(splitted[j].data()) == 0)
								pe_info.mPartComb[i][j] = false;
							else
								pe_info.mPartComb[i][j] = true;
						}
						pe_info.mPartCombDup[i] = atoi(splitted[set_cnt].data());
					}
					b_more = getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
					b_pe_info = true;
				}

				if (b_pe_info)
				{
					splice_read_cnt_pe.resize(set_cnt);
					for (int i = 0; i < set_cnt; i++)
					{
						splice_read_cnt_pe[i].resize(set_cnt);
						for (int j = 0; j < set_cnt; j++)
							splice_read_cnt_pe[i][j] = atof(splitted[j+1].data());

						b_more = getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);
					}
				}

				// Load start end exons, optional
				if (b_more && "SE exons:" == splitted[0])
				{
					int se_pair_cnt = atoi(splitted[1].data());
					an_instance.mStartExons.resize(se_pair_cnt);
					an_instance.mEndExons.resize(se_pair_cnt);
					for (int i = 0; i < se_pair_cnt; i++)
					{
						getline(infile, line); line_cnt++;
						splitted = Utility::Split('\t', line);

						vector<string> indexes = Utility::Split(',', splitted[0]);
						for (unsigned j = 0; j < indexes.size(); j++)
							an_instance.mStartExons[i].push_back(atoi(indexes[j].data()));

						indexes = Utility::Split(',', splitted[1]);
						for (unsigned j = 0; j < indexes.size(); j++)
							an_instance.mEndExons[i].push_back(atoi(indexes[j].data()));
					}
					b_more = getline(infile, line); line_cnt++;
					splitted = Utility::Split('\t', line);
				}

				loaded_cnt++;
				if (!b_more) break;
			}

			cout << "Totally, " << loaded_cnt << " instances are loaded" << endl;
			infile.close();

		}

	private:
		string mInputFile;

}; /* -----  end of class InstanceReader  ----- */

#endif
