/*
 * =====================================================================================
 *
 *       Filename:  InstanceStatistics.h
 *
 *    Description:  This is the header file for class InstanceStatistics
 *
 *        Version:  1.0
 *        Created:  04/20/2009 04:34:48 PM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
 *        Company:  THU
 *
 * =====================================================================================
 */

#ifndef InstanceStatistics_H 
#define InstanceStatistics_H

#include <string>
#include <iostream>
#include "InstanceHandler.h"
#include "Utility2.h"

using namespace std;

/*
 * =====================================================================================
 *        Class:  InstanceStatistics
 *  Description:  Do various statistics on all the instances
 * =====================================================================================
 */
class InstanceStatistics : public InstanceHandler
{
	public:
		/* ====================  LIFECYCLE     ======================================= */
		InstanceStatistics (ostream* p_output = NULL) : InstanceHandler(p_output)
		{
		};                             /* constructor */

		virtual ~InstanceStatistics (){};                             /* constructor */


		virtual
		void
		Initialize()
		{
			mRemainedIsoCnt = 0;
			mBadInstance = 0;
			mIsoLenRatio.assign(100, 0);
			mTotalGene = mTotalIso = mTotalExon = 0;
			mExonUB = 41;
			mIsoUB= 11;
			mExonVSIso.resize(mExonUB+1);
			for (int i = 0; i < mExonVSIso.size(); i++)
				mExonVSIso[i].resize(mIsoUB+1);

			mTotalJuncReadCnt = 0;
			mTotalExonReadCnt = 0;
			mHighCnt = 0;
			mExpOfGenes.resize(0);
			mFullyContained = 0;
			mTSSCnt = 0;
			mPASCnt = 0;
			mTSS_PASCnt = 0;
			mExonExpDist.assign(31, 0);

			InstanceHandler::Initialize();	
		};

		virtual
		void
		OnInstance(Instance& an_instance)
		{
			int 					&instance_cnt =     an_instance.mInstanceCnt;
			vector<int> 			&set_sizes =        an_instance.mSetSizes;
			vector<double> 			&sample_cnt =       an_instance.mSampleCnt;
			vector<vector<bool> > 	&isoforms =         an_instance.mIsoforms;
			vector<vector<double> > &splice_read_cnt =  an_instance.mSpliceReadCnt;
			vector<Gene> 			&genes =   		    an_instance.mGenes;
			double                  &noise_level =      an_instance.mNoiseLevel;
			int                     &known_cnt =        an_instance.mKnownCnt;

			bool succ = false;
			for (int i = 0; i < set_sizes.size(); i++)
				if (set_sizes[i] >= an_instance.mReadLen) 
				{
					succ = true;
					break;
				}
			if (!succ) mBadInstance++;

			mTotalIso += isoforms.size();
			mTotalGene++;
			mTotalExon += an_instance.mExons.size();

			if (set_sizes.size() <= 40 && set_sizes.size() > 1)
				mRemainedIsoCnt += isoforms.size();

			int ss = set_sizes.size(); 
			if (ss > mExonUB)
				ss = mExonUB;
			int is = isoforms.size(); 
			if (is > mIsoUB)
				is = mIsoUB;

			mExonVSIso[ss][is]++;

			for (int i = 0; i < an_instance.mIsoforms.size(); i++)
			{
				int cnt = 0;
				for (int j = 0; j < an_instance.mIsoforms[i].size(); j++)
					if (an_instance.mIsoforms[i][j]) cnt++;

				for (int j = 0; j < 20; j++)
				{
					if ((double)cnt / set_sizes.size() >= j * 0.05)
						mIsoLenRatio[j]++;
				}
			}


			for (int i = 0; i < an_instance.mSampleCnt.size(); i++)
				mTotalExonReadCnt += an_instance.mSampleCnt[i];

			for (int i = 0; i < set_sizes.size(); i++)
				for (int j = 0; j < set_sizes.size(); j++)
					mTotalJuncReadCnt += an_instance.mSpliceReadCnt[i][j];

			double read_cnt = 0;
			for (int i = 0; i < an_instance.mSampleCnt.size(); i++)
				read_cnt += an_instance.mSampleCnt[i];

			double gene_len = 0;
			for (int i = 0; i < an_instance.mSetSizes.size(); i++)
				gene_len += an_instance.mSetSizes[i];

			if (gene_len == 0)
			{
				read_cnt = 0;
				gene_len = 1;
			}
			
			if (read_cnt / gene_len >= 0.04) mHighCnt++;

			for (unsigned i = 0; i < an_instance.mIsoExp.size(); i++)
			{
				int len = 0;
				for (unsigned j = 0; j < an_instance.mSetSizes.size(); j++)
					if (an_instance.mIsoforms[i][j]) len += an_instance.mSetSizes[j];
				mWeightedIsoLen += len * an_instance.mIsoExp[i];
			}

			for (unsigned i = 0; i < an_instance.mIsoforms.size(); i++)
			{
				// whether is i fully contained in j?
				bool b_fully_contained;
				for (unsigned j = 0; j < an_instance.mIsoforms.size(); j++)
				{
					if (i == j) continue;
					b_fully_contained = true;
					for (unsigned k = 0; k < an_instance.mSetSizes.size(); k++)
						if (an_instance.mIsoforms[i][k] && !an_instance.mIsoforms[j][k])
						{
							b_fully_contained = false;
							break;
						}
					if (b_fully_contained) break;
				}
				if (b_fully_contained)
					mFullyContained++;
			}

			mTSS_PASCnt += an_instance.mStartExons.size();
			set<int> start_exon;
			set<int> end_exon;
			for (unsigned i = 0; i < an_instance.mStartExons.size(); i++)
			{
				for (unsigned j = 0; j < an_instance.mStartExons[i].size(); j++)
					start_exon.insert(an_instance.mStartExons[i][j]);
				for (unsigned j = 0; j < an_instance.mEndExons[i].size(); j++)
					end_exon.insert(an_instance.mEndExons[i][j]);
			}
			mTSSCnt += start_exon.size();
			mPASCnt += end_exon.size();

			for (unsigned i = 0; i < an_instance.mSetSizes.size(); i++)
			{
				double exp = ((double)an_instance.mSampleCnt[i]) / an_instance.mSetSizes[i];
				int idx = (int) (exp * 10);
				if (idx > mExonExpDist.size()-1) idx = mExonExpDist.size()-1;

				mExonExpDist[idx]++;
			}

			//(*mpOutput) << read_cnt / gene_len << endl;
		}/* -----  end of method OnInstance  ----- */

		virtual
		void
		CleanUp()
		{
			Output();
			InstanceHandler::Initialize();	
		};


		void
		Output()
		{
			cout << "Iso length distribution : " << endl;
			for (int i = 0; i < 20; i++)
				cout << " >= " << i * 0.05 << "\t" << (double)mIsoLenRatio[i] / mTotalIso << endl;
			cout << "Exon exp distribution : " << endl;
			for (unsigned i = 0; i < mExonExpDist.size(); i++)
				cout << i*0.1 << "\t" << mExonExpDist[i] << endl;
			cout << "Total gene cnt      = " << mTotalGene<< endl;
			cout << "Total isoform   cnt = " << mTotalIso << endl;
			cout << "Total exon      cnt = " << mTotalExon << endl;
			cout << "Total fully contained isoform   cnt = " << mFullyContained<< endl;
			cout << "Total remained isoform cnt = " << mRemainedIsoCnt << endl;
			cout << "Total exon read cnt = " << mTotalExonReadCnt << endl;
			cout << "Total junc read cnt = " << mTotalJuncReadCnt << endl;
			cout << "Total read cnt      = " << mTotalJuncReadCnt + mTotalExonReadCnt << endl;
			cout << "Total weighted isoform len = " << mWeightedIsoLen << endl;
			cout << "Total TSSs = " << mTSSCnt << endl;
			cout << "Total PASs = " << mPASCnt << endl;
			cout << "Total TSS-PASs = " << mTSS_PASCnt << endl;
			cout << "Read / Base (relative) = " << ((double)(mTotalJuncReadCnt + mTotalExonReadCnt)) / mWeightedIsoLen << endl;
			cout << mHighCnt << " instances have expression level about 0.04." << endl;

			cout << "exon vs isoform" << endl;
			cout << setw(6) << " ";
			for (int i = 1; i <= mIsoUB; i++)
				cout << setw(6) << i;

			cout << setw(6) << "sum" << endl;
			int tot_cnt = 0;
			vector<int> col_sum;
			col_sum.assign(mIsoUB+1, 0);
			for (int i = 1; i <= mExonUB; i++)
			{
				int row_sum = 0;
				cout << setw(6) << i;
				for (int j = 1; j <= mIsoUB; j++)
				{
					col_sum[j] += mExonVSIso[i][j];
					row_sum += mExonVSIso[i][j];
					cout << setw(6) << mExonVSIso[i][j];
				}
				cout << setw(6) << row_sum << endl;
				tot_cnt += row_sum;
			}
			cout << setw(6) << "sum";
			for (int i = 1; i <= mIsoUB; i++)
				cout << setw(6) << col_sum[i];
			cout << setw(6) << tot_cnt << endl;
			cout << "Total bad instances = " << mBadInstance << endl;
		}

	protected:

	private:
		int mRemainedIsoCnt;
		int mBadInstance;
		vector<int> mIsoLenRatio;

		vector<vector<int> > mExonVSIso;
		int mExonUB;
		int mIsoUB;

		int mTotalIso;
		int mTotalGene;
		int mTotalExon;
		double mTotalExonReadCnt;
		double mTotalJuncReadCnt;
		vector<double> mExpOfGenes;
		int mHighCnt;
		double mWeightedIsoLen;
		int mFullyContained;

		int mTSSCnt;
		int mPASCnt;
		int mTSS_PASCnt;
		vector<int> mExonExpDist;

}; /* -----  end of class InstanceStatistics  ----- */


#endif
