/*
 * =====================================================================================
 *
 *       Filename:  RandomExpReadAssignerIM.h
 *
 *    Description:  This is the header file for class RandomExpReadAssignerIM
 *
 *        Version:  1.0
 *        Created:  04/22/2009 11:26:44 AM
 *       Revision:  none
 *       Compiler:  gcc
 *
 *         Author:  Jianxing Feng (feeldead), feeldead@gmail.com
 *        Company:  THU
 *
 * =====================================================================================
 */

#ifndef RandomExpReadAssignerIM_H 
#define RandomExpReadAssignerIM_H

#include <string>
#include <vector>
#include <iostream>
#include "InstanceHandler.h"
#include "Utility.hpp"
#include "ReadInfoBase.hpp"
#include "ReadInfoPE.hpp"

#include <gsl/gsl_cdf.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>

using namespace std;

/*
 * =====================================================================================
 *        Class:  RandomExpReadAssignerIM
 *  Description:  This is an improvement over RandomExpReadAssigner. This class will
 *                randomly generate expression level and reads over all the genes simultaneously.
 * =====================================================================================
 */
class RandomExpReadAssignerIM : public InstanceHandler
{
	public:
		/* ====================  LIFECYCLE     ======================================= */

		// The random generator is only allocated in Initialize(). The expression
		// type defaults to 1 (pow(10, r)) so that GenerateRandomExp never reads
		// an uninitialized member when SetRandExpType() has not been called.
		//
		// NOTE: the class holds a raw gsl_rng pointer and relies on the implicit
		// copy operations, so an object must not be copied after Initialize().
		RandomExpReadAssignerIM (ostream* p_output = NULL) : InstanceHandler(p_output)
		{
			mpReadInfo = 0;
			mRng = 0;
			mRandExpType = 1;
		};                             /* constructor */

		// Releases the generator if CleanUp() has not already done so.
		virtual ~RandomExpReadAssignerIM ()
		{
			ReleaseRng();
		};                             /* destructor */

		// Select the distribution of the random expression levels:
		//   3         -> uniform random number from gsl_rng_uniform
		//   2         -> pow(2, r),  r following a standard normal distribution
		//   otherwise -> pow(10, r), r following a standard normal distribution
		void
		SetRandExpType(int rand_exp_type)
		{
			mRandExpType = rand_exp_type;
		}

		// Register the read model. With a non-null model, OnInstance only buffers
		// the instances and the reads are generated over all of them in CleanUp.
		// With a null model (the default), OnInstance assigns random expression
		// levels and forwards the instance to InstanceHandler::OnInstance.
		// The pointer is stored as is; this class never deletes it.
		void
		SetReadInfo(ReadInfoBase* p_read_info)
		{
			mpReadInfo = p_read_info;
		}

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  Initialize
		 *  Description:  Initialize the base handler and (re)create the random number
		 *                generator. Any generator left from a previous call is freed first.
		 *        Param:
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		virtual
		void
		Initialize()
		{
			InstanceHandler::Initialize();

			ReleaseRng();

			// prepare the random generator
			gsl_rng_env_setup();

			const gsl_rng_type* T = gsl_rng_default;
			mRng = gsl_rng_alloc (T);

			// To check whether the used seed are identical during different run 
			// of the program. Because the only way to set a seed for the generator
			// is to set the environment variable GSL_RNG_SEED.
			// Note that this consumes the first number of the random stream.
			cerr << "First random number is : " << gsl_rng_uniform(mRng) << endl;
		};

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  CleanUp
		 *  Description:  If a read model has been set, generate the reads over all the
		 *                buffered instances. Then drop the buffered instances, release
		 *                the random generator and clean up the base handler.
		 *        Param:
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		virtual
		void
		CleanUp()
		{
			if (mpReadInfo)
			{
				GenerateRandomShortReads();
			}

			// The buffered instances have been consumed. Dropping them prevents
			// a later Initialize/CleanUp cycle from sampling them again.
			mAllInstances.clear();

			// ReleaseRng also resets the pointer, so calling CleanUp twice or
			// calling Initialize afterwards does not free the generator again.
			ReleaseRng();

			InstanceHandler::CleanUp();
		};

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  OnInstance
		 *  Description:  Without a read model: assign random expression levels to the
		 *                instance and pass it on to InstanceHandler::OnInstance.
		 *                With a read model: only store a copy of the instance. Its
		 *                expression levels are used as given; the reads are generated
		 *                later in CleanUp.
		 *        Param:  an_instance : the instance to handle
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		virtual
		void
		OnInstance(Instance& an_instance)
		{
			if (!mpReadInfo)
			{
				GenerateRandomExp(an_instance);
				InstanceHandler::OnInstance(an_instance);
			}
			else
				mAllInstances.push_back(an_instance);
		}/* -----  end of method OnInstance  ----- */

	protected:

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  GenerateRandomExp
		 *  Description:  Generate one random expression level for every isoform of the
		 *                instance and store them in an_instance.mIsoExp.
		 *                Initialize() must have been called before (mRng is used).
		 *        Param:  There are three types of random expression levels.
		 *                1. pow(10, r), where r is a random variable following standard normal
		 *                    distribution. Also used for any type other than 2 and 3.
		 *                2. pow(2, r), where r ....
		 *                3. r drawn by gsl_rng_uniform
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		void
		GenerateRandomExp(Instance& an_instance)
		{
			an_instance.mIsoExp.resize(an_instance.mIsoforms.size());
			for (size_t i = 0; i < an_instance.mIsoExp.size(); i++)
			{
				double r;
				if (3 == mRandExpType)
				{
					// Generate a random variable obeying uniform distribution
					r = gsl_rng_uniform(mRng);
				}
				else
				{
					// Generate a random variable obeying standard normal distribution
					// The expression levels follow a lognormal distribution.
					// Note that even though the expression levels are relative expression levels.
					// pow(10, r) and exp(r) are not equivalent. pow(10, r) will enlarge the
					// difference between expression levels more than exp(r) does.
					r = gsl_ran_gaussian_ziggurat(mRng, 1);
					if (2 == mRandExpType)
						r = pow(2.0, r);
					else 
						r = pow(10.0, r);
				}

				an_instance.mIsoExp[i] = r;
			}
		}

		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  GenerateRandomShortReads
		 *  Description:  Sample mpReadInfo->mTotalReadCnt reads over all the buffered
		 *                instances and write them to the output. A gene and then one of its
		 *                isoforms are chosen with probability proportional to
		 *                (isoform length * expression level); the start position is
		 *                uniform on the isoform. Candidates that are too short or that do
		 *                not fit into the isoform are rejected and re-sampled.
		 *                The expression levels in mAllInstances should be initialized
		 *                before calling this function; missing levels count as 0.
		 *                NOTE: the rejection loop has no upper bound. It does not
		 *                terminate if no candidate read can ever be accepted (e.g. every
		 *                isoform is shorter than every possible read).
		 *        Param: 
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		void
		GenerateRandomShortReads()
		{
			if (mAllInstances.size() == 0) return;

			if (!mRng || !mpReadInfo)
			{
				cerr << "WARNING: reads are not generated because the random generator or the read information is missing" << endl;
				return;
			}

			// According to the expression levels of the instances,
			// prepare partial weight sum over the genes ...
			vector<double> gene_wlength_part_sum(mAllInstances.size(), 0);

			// ... and over the isoforms of each gene
			vector<vector<double> > iso_wlength_part_sum(mAllInstances.size());

			vector<vector<int> > iso_len(mAllInstances.size());

			// Calculate the (weighted) length of each isoform
			for (size_t ins = 0; ins < mAllInstances.size(); ins++)
			{
				Instance& an_instance = mAllInstances[ins];
				const size_t iso_cnt = an_instance.mIsoforms.size();

				iso_len[ins].assign(iso_cnt, 0);
				vector<double>& wlength = iso_wlength_part_sum[ins];
				wlength.assign(iso_cnt, 0);
				for (size_t i = 0; i < iso_cnt; i++)
				{
					// Never read beyond the shorter one of the isoform and the segment lengths
					size_t seg_cnt = an_instance.mSegLen.size();
					if (an_instance.mIsoforms[i].size() < seg_cnt)
						seg_cnt = an_instance.mIsoforms[i].size();

					int length = 0;
					for (size_t j = 0; j < seg_cnt; j++)
						if (an_instance.mIsoforms[i][j])
							length += an_instance.mSegLen[j];
					if (length < 0) length = 0;
					iso_len[ins][i] = length;

					// An isoform without an expression level gets weight 0
					double exp_level = 0;
					if (i < an_instance.mIsoExp.size())
						exp_level = an_instance.mIsoExp[i];

					wlength[i] = length * exp_level;
					// Written this way such that NaN is mapped to 0 as well
					if (!(wlength[i] > 0)) wlength[i] = 0;
				}
				for (size_t i = 1; i < iso_cnt; i++)
					wlength[i] += wlength[i-1];

				// A gene without any isoform contributes nothing
				gene_wlength_part_sum[ins] = wlength.empty() ? 0 : wlength[wlength.size()-1];
				if (ins > 0)
					gene_wlength_part_sum[ins] += gene_wlength_part_sum[ins-1];
			}

			double tot_gene_wlen = gene_wlength_part_sum[gene_wlength_part_sum.size()-1];

			// Without any positive weight no read could ever be accepted and the
			// sampling loop below would never terminate.
			if (!(tot_gene_wlen > 0))
			{
				cerr << "WARNING: reads are not generated because all the isoforms have zero weighted length" << endl;
				return;
			}

			int cnt = 0;
			while (cnt < mpReadInfo->mTotalReadCnt)
			{
				// Randomly select a gene
				double r = gsl_rng_uniform(mRng) * tot_gene_wlen;
				unsigned gene_idx = UtilityTemp<double>::BinarySearch(gene_wlength_part_sum, r);
				// Find the last gene in case some gene has zero contribution to wlength
				// NOTE(review): this moves to the last entry of a run of equal partial
				// sums. Whether this is the intended gene depends on the exact contract
				// of UtilityTemp::BinarySearch, which is not visible here -- confirm.
				while (gene_idx + 1 < gene_wlength_part_sum.size() && 
					gene_wlength_part_sum[gene_idx] == gene_wlength_part_sum[gene_idx+1])
					gene_idx++;

				// Randomly select an isoform on this gene
				vector<double>& iso_wlen = iso_wlength_part_sum[gene_idx];
				// A gene without isoforms can not produce a read, sample again
				if (iso_wlen.empty()) continue;

				r = gsl_rng_uniform(mRng) * iso_wlen[iso_wlen.size()-1];
				unsigned iso_idx = UtilityTemp<double>::BinarySearch(iso_wlen, r);
				// Find the last iso in case some isoform has zero contribution to wlength
				while (iso_idx + 1 < iso_wlen.size() && iso_wlen[iso_idx] == iso_wlen[iso_idx+1])
					iso_idx++;

				// Note that read_len may be a random variable
				int read_len = (int)mpReadInfo->ReadLen(mRng);
				if (read_len <= 0 || read_len <= 2 * mpReadInfo->mCrossStrength) continue;

				// Randomly select a start position of the read
				int start_pos = (int)(gsl_rng_uniform(mRng) * iso_len[gene_idx][iso_idx]);

				// If the end position of this read is outside of the current isoform
				if (iso_len[gene_idx][iso_idx] < start_pos + read_len) continue;

				if (mpReadInfo->IsSingleEnd())
					OutputSingleEndRead(mAllInstances[gene_idx], iso_idx, start_pos, read_len);
				else
				{
					int end_len = ((ReadInfoPE*)mpReadInfo)->mEndLen;
					if (read_len < 2 * end_len) continue;
					// Output a paired-end read: one end at each side of the fragment
					OutputSingleEndRead(mAllInstances[gene_idx], iso_idx, start_pos, end_len);
					OutputSingleEndRead(mAllInstances[gene_idx], iso_idx, start_pos + read_len - end_len, end_len);
				}

				cnt++;
				if (cnt % 100000 == 0) 
					cout << cnt << " reads have been sampled" << endl;
			}
		}
	
		/*
		 *--------------------------------------------------------------------------------------
		 *        Class:  RandomExpReadAssignerIM
		 *       Method:  OutputSingleEndRead
		 *  Description:  Output a randomly generated single-end read as one line:
		 *                chromosome, strand, comma separated start positions and comma
		 *                separated end positions of the pieces of the read, all separated
		 *                by tabs. Nothing is written if the read does not overlap the
		 *                isoform.
		 *        Param:  selected_instance : the gene the read is sampled from
		 *                iso_idx   : index of the isoform in selected_instance.mIsoforms
		 *                start_pos : start of the read, as an offset on the isoform
		 *                read_len  : length of the read
		 *       Return:
		 *--------------------------------------------------------------------------------------
		 */
		void
		OutputSingleEndRead(Instance& selected_instance, int iso_idx, int start_pos, int read_len)
		{
			vector<int>& seg_lens = selected_instance.mSegLen;
			vector<bool>& an_iso = selected_instance.mIsoforms[iso_idx];

			// Indexes of the segments covered by the read
			vector<int> involved;
			int len = 0;                // length of the isoform up to the current segment
			int first_len = 0;          // offset of the read start inside the first segment
			int last_len = 0;           // offset of the read end inside the last segment
			bool b_first = true;
			for (size_t i = 0; i < an_iso.size() && i < seg_lens.size(); ++i)
			{
				if (an_iso[i]) len += seg_lens[i];
				else continue;

				if (len > start_pos)
				{
					if (b_first)
					{
						first_len = start_pos - ( len - seg_lens[i] );
						b_first = false;
					}
					involved.push_back(i);
				}
				if (len >= start_pos + read_len)
				{
					last_len = start_pos + read_len - ( len - seg_lens[i] );
					break;
				}
			}

			// The read starts at or behind the end of the isoform. There is
			// nothing to output and involved[0] below would be invalid.
			if (involved.empty())
			{
				cerr << "WARNING: a read which does not overlap the selected isoform is skipped" << endl;
				return;
			}

			// Output the mapping information of this read
			(*mpOutput) << selected_instance.mExons[0].mChr << "\t" << (selected_instance.mExons[0].mStrand ? '+' : '-');

			// Start positions: the first one is shifted into the first segment
			(*mpOutput) << "\t" << selected_instance.mExons[involved[0]].mStart + first_len;
			for (size_t i = 1; i < involved.size(); ++i)
				(*mpOutput) << "," << selected_instance.mExons[involved[i]].mStart;

			// End positions: the last one is derived from the start of the last segment
			(*mpOutput) << "\t";
			for (size_t i = 0; i + 1 < involved.size(); ++i)
				(*mpOutput) << selected_instance.mExons[involved[i]].mEnd<< ",";
			(*mpOutput) << selected_instance.mExons[involved[involved.size()-1]].mStart + last_len << endl;
		}


	private:
		// Free the random generator, if any, and reset the pointer such that
		// the generator is never freed twice.
		void
		ReleaseRng()
		{
			if (mRng)
			{
				gsl_rng_free(mRng);
				mRng = 0;
			}
		}

		// Random generator, allocated in Initialize(); 0 when not allocated
		gsl_rng * mRng;

		// Instances buffered by OnInstance while a read model is set
		vector<Instance> mAllInstances;

		// Read model set by SetReadInfo; not owned by this class
		ReadInfoBase* mpReadInfo;

		// Type of the random expression levels, see SetRandExpType
		int mRandExpType;
}; /* -----  end of class RandomExpReadAssignerIM  ----- */

#endif
