import java.lang.*;
import java.io.*;
import java.util.*;

//===================================================================================================
// DataFormat.java - this file contains the class for storing data and operations on it
// In effect this file handles data storage and manipulation
//===================================================================================================

public class DataFormat {

	int numInstances;         // number of instances (data rows) announced by the file header
	int numAttribs;           // number of attributes (columns) announced by the file header
	String nameAttribLearn;	  // name of the attribute to be learned
	Vector attribData;        // information about each attribute - a vector of Class AttribDetails
	Vector classifierData;    // one entry per category of the attribute to be learned - a vector of Class Classifier

	PushbackReader dataPushbackRd; // PushBack Reader for the data file


	//--------------------------------------------------------------------------
	// Constructor for Data Format -- initializes the various structures in it
	//--------------------------------------------------------------------------

	public DataFormat() {
		numInstances = 0;
		numAttribs = 0;
		nameAttribLearn = new String();
		attribData      = new Vector();
		classifierData = new Vector();
		System.out.println("Created Data Format");
	} // DataFormat::DataFormat


	//--------------------------------------------------------------------------
	// readDataFile - read in data from the file
	//
	// Expected layout: one header line (#instances, #attributes, name of the
	// attribute to learn), then one line per attribute (name, datatype, type,
	// followed by the categories or the two range values), then one line per
	// instance holding one value per attribute.
	//
	// Any data held from a previous read is discarded first.
	//
	// Returns the number of instances read, or 0 if the file could not be
	// opened, parsed or closed (a diagnostic is printed in that case).
	// The reader is closed on every path, including the failure paths.
	//--------------------------------------------------------------------------

	public int readDataFile(String dataFileName) {
		System.out.println("File name received: " + dataFileName + "\n");

		// create a file and a filereader
		File dataFile = new File( dataFileName);
		if( !dataFile.exists()) {
			System.out.println("Unable to open Data File " + dataFileName + ". It may not exist.\n");
			return 0;
		} // if data file does not exist

		FileReader dataFileRd;
		try {
			dataFileRd = new FileReader( dataFile);
		} catch ( FileNotFoundException eFileNotFoundOpen) {
			System.out.println("FileNotFoundException while opening File " + dataFileName);
			return 0;
		} // catch FileNotFoundException

		boolean readOk = false;
		Reader openReader = dataFileRd; // outermost reader opened so far; closing it closes everything beneath it
		try {
			dataPushbackRd = new PushbackReader( dataFileRd, ConstantValues.FILE_READ_PUSHBACK_SIZE);
			openReader = dataPushbackRd;

			// start from a clean slate so that a second read does not append to the first
			numInstances = 0;
			numAttribs = 0;
			attribData.removeAllElements();
			classifierData.removeAllElements();

			readOk = readHeader( dataFileName)
				&& readAttribDetails( dataFileName)
				&& readInstances( dataFileName);
		} finally {
			// close the filereader and the file, whether or not parsing succeeded
			try {
				openReader.close();
			} catch ( IOException eIOProblemClose) {
				System.out.println("IOException while closing File " + dataFileName);
				readOk = false;
			} // catch IOException
		}

		return readOk ? numInstances : 0; // if zero instances returned, there's a problem with reading the file
	} // DataFormat::readDataFile


	//--------------------------------------------------------------------------
	// readHeader - parses the first line of the file into numInstances,
	// numAttribs and nameAttribLearn. Returns false (after printing a
	// diagnostic) if the line is missing, short, non-numeric or negative.
	//--------------------------------------------------------------------------

	private boolean readHeader(String dataFileName) {
		Vector retValueNextLine = getNextLine();
		String lineFromFile = (String)retValueNextLine.elementAt(0);
		boolean fileOver = ((Boolean)retValueNextLine.elementAt(1)).booleanValue();

		if( fileOver) {
			System.out.println("Abrupt end of file :" + dataFileName + ". File contains no header.\n");
			return false;
		} // no lines at all -- error in data file

		StringTokenizer attribSeparator = new StringTokenizer( lineFromFile, ConstantValues.TOKEN_SEPARATORS, false);

		try {
			numInstances    = Integer.parseInt(attribSeparator.nextToken().trim());
			numAttribs      = Integer.parseInt(attribSeparator.nextToken().trim());
			nameAttribLearn = attribSeparator.nextToken().trim();
			System.out.println("#Instances : " + numInstances + "\n" + "#Attributes : " + numAttribs + "\n" +
				"LearnAttrib : " + nameAttribLearn + "\n");
		} catch (NoSuchElementException eNoSuchElement) {
			System.out.println("Abrupt end of file :" + dataFileName + ". File header contains fewer entries than expected.\n");
			return false;
		} catch (NumberFormatException eNumberFormat) {
			System.out.println("Invalid number in header of file :" + dataFileName + ".\n");
			return false;
		} // catch NumberFormatException

		if( numInstances < 0 || numAttribs < 0) {
			System.out.println("Negative count in header of file :" + dataFileName + ".\n");
			return false;
		} // counts must not be negative

		return true;
	} // DataFormat::readHeader


	//--------------------------------------------------------------------------
	// readAttribDetails - reads the next numAttribs lines, one per attribute,
	// and appends an AttribDetails entry to attribData for each.
	// Returns false (after printing a diagnostic) on the first malformed line.
	//--------------------------------------------------------------------------

	private boolean readAttribDetails(String dataFileName) {
		for( int countAttrib = 0; countAttrib < numAttribs; countAttrib++) {
			Vector retValueNextLine = getNextLine();
			String lineFromFile = (String)retValueNextLine.elementAt(0);
			boolean fileOver = ((Boolean)retValueNextLine.elementAt(1)).booleanValue();

			if( fileOver) {
				System.out.println("Abrupt end of file :" + dataFileName + ". insufficient number of attribute details.\n");
				return false;
			} // no more lines -- error in data file

			StringTokenizer attribSeparator = new StringTokenizer( lineFromFile, ConstantValues.TOKEN_SEPARATORS, false);
			AttribDetails attribDataVectorEntry = new AttribDetails();

			try {
				// read the name, datatype and type of the attribute
				attribDataVectorEntry.attribName         = attribSeparator.nextToken().trim();
				attribDataVectorEntry.attribDataTypeStr  = attribSeparator.nextToken().trim();
				attribDataVectorEntry.attribTypeStr      = attribSeparator.nextToken().trim();

				// set the datatype of the attribute from string to integer (for easier comparison later on)
				if( attribDataVectorEntry.attribDataTypeStr.equalsIgnoreCase( ConstantValues.DATATYPE_REAL_STR)) {
					attribDataVectorEntry.attribDataType  = ConstantValues.DATATYPE_REAL;
				} // datatype is REAL
				else if( attribDataVectorEntry.attribDataTypeStr.equalsIgnoreCase( ConstantValues.DATATYPE_INTEGER_STR)) {
					attribDataVectorEntry.attribDataType  = ConstantValues.DATATYPE_INTEGER;
				} // datatype is INTEGER
				else if( attribDataVectorEntry.attribDataTypeStr.equalsIgnoreCase( ConstantValues.DATATYPE_SYMBOLIC_STR)) {
					attribDataVectorEntry.attribDataType  = ConstantValues.DATATYPE_SYMBOLIC;
				} // datatype is SYMBOLIC
				else {
					System.out.println("Invalid datatype for attribute :" + (countAttrib+1) + ".\n");
					return false;
				} // invalid datatype

				// set the type of the attribute from string to integer (for easier comparison later on)
				if( attribDataVectorEntry.attribTypeStr.equalsIgnoreCase( ConstantValues.VAL_CONTINUOUS_STR)) {
					attribDataVectorEntry.attribType      = ConstantValues.VAL_CONTINUOUS;
				} // value is continuous
				else if( attribDataVectorEntry.attribTypeStr.equalsIgnoreCase( ConstantValues.VAL_DISCRETE_STR)) {
					attribDataVectorEntry.attribType      = ConstantValues.VAL_DISCRETE;
				} // value is discrete
				else {
					System.out.println("Invalid type for attribute :" + (countAttrib+1) + ".\n");
					return false;
				} // invalid value type

				if( attribDataVectorEntry.attribType == ConstantValues.VAL_DISCRETE) {
					// every remaining token is a category; it is stored with the same
					// Java type as the instance values so that the two compare equal later
					while( attribSeparator.hasMoreTokens()) {
						attribDataVectorEntry.attribCategories.addElement(
							parseValue( attribSeparator.nextToken().trim(), attribDataVectorEntry));
					}
				} // processing for discrete attributes
				else {
					// exactly two tokens are read here: the bounds of the range
					if( attribDataVectorEntry.attribDataType == ConstantValues.DATATYPE_INTEGER) {
						attribDataVectorEntry.attribRange.addElement(Integer.valueOf(attribSeparator.nextToken().trim()));
						attribDataVectorEntry.attribRange.addElement(Integer.valueOf(attribSeparator.nextToken().trim()));
					} // integer continuous data
					else {
						attribDataVectorEntry.attribRange.addElement(Double.valueOf( attribSeparator.nextToken().trim()));
						attribDataVectorEntry.attribRange.addElement(Double.valueOf( attribSeparator.nextToken().trim()));
					} // real continuous data
				} // processing for continuous attributes
			} catch (NoSuchElementException eNoSuchElement) {
				System.out.println("Abrupt end of file :" + dataFileName + ". Attrib header " + (countAttrib+1) +
					" contains fewer entries than expected.\n");
				return false;
			} catch (NumberFormatException eNumberFormat) {
				System.out.println("Invalid numeric value in file :" + dataFileName + ". Attrib header " + (countAttrib+1) + ".\n");
				return false;
			} // catch NumberFormatException

			attribData.addElement( attribDataVectorEntry);	// add the completed attribute header into the main structure
		} // for countAttrib -- for each attribute

		return true;
	} // DataFormat::readAttribDetails


	//--------------------------------------------------------------------------
	// readInstances - reads the next numInstances lines and appends one value
	// per attribute to the attribValues of the matching AttribDetails entry.
	// A row is parsed completely before any of it is stored, so a malformed
	// row never leaves the per-attribute value vectors with different lengths.
	// Returns false (after printing a diagnostic) on the first malformed line.
	//--------------------------------------------------------------------------

	private boolean readInstances(String dataFileName) {
		for( int countInstance = 0; countInstance < numInstances; countInstance++) {
			Vector retValueNextLine = getNextLine();
			String lineFromFile = (String)retValueNextLine.elementAt(0);
			boolean fileOver = ((Boolean)retValueNextLine.elementAt(1)).booleanValue();

			if( fileOver) {
				System.out.println("Abrupt end of file :" + dataFileName + ". insufficient number of instances.\n");
				return false;
			} // no more lines -- error in data file

			StringTokenizer attribSeparator = new StringTokenizer( lineFromFile, ConstantValues.TOKEN_SEPARATORS, false);
			Object[] rowValues = new Object[numAttribs]; // holds the parsed row until it is known to be complete

			try {
				for( int countAttrib = 0; countAttrib < numAttribs; countAttrib++) {
					rowValues[countAttrib] = parseValue( attribSeparator.nextToken().trim(), attribAt( countAttrib));
				} // for countAttrib -- each attribute
			} catch (NoSuchElementException eNoSuchElement) {
				System.out.println("Abrupt end of file :" + dataFileName + ". Instance " + (countInstance+1) +
					" contains fewer entries than expected.\n");
				return false;
			} catch (NumberFormatException eNumberFormat) {
				System.out.println("Invalid numeric value in file :" + dataFileName + ". Instance " + (countInstance+1) + ".\n");
				return false;
			} // catch NumberFormatException

			for( int countAttrib = 0; countAttrib < numAttribs; countAttrib++) {
				attribAt( countAttrib).attribValues.addElement( rowValues[countAttrib]);
			} // for countAttrib -- store the completed row
		} // for countInstance -- for each instance

		return true;
	} // DataFormat::readInstances


	//--------------------------------------------------------------------------
	// parseValue - converts a token to the Java type used for the attribute's
	// datatype: Integer for INTEGER, Double for REAL, the String itself
	// otherwise. Throws NumberFormatException if a numeric token is malformed.
	//--------------------------------------------------------------------------

	private Object parseValue(String token, AttribDetails details) {
		if( details.attribDataType == ConstantValues.DATATYPE_INTEGER) {
			return Integer.valueOf( token);
		} // attribute type is integer
		if( details.attribDataType == ConstantValues.DATATYPE_REAL) {
			return Double.valueOf( token);
		} // attribute type is real
		return token; // attribute type is symbolic
	} // DataFormat::parseValue


	//--------------------------------------------------------------------------
	// attribAt - returns the AttribDetails entry at the zero-based index
	//--------------------------------------------------------------------------

	private AttribDetails attribAt(int attribIndex) {
		return (AttribDetails)(attribData.elementAt( attribIndex));
	} // DataFormat::attribAt


	//--------------------------------------------------------------------------
	// findAttribIndex - returns the zero-based index of the first attribute
	// with the given name, or -1 if the name is null or matches no attribute
	//--------------------------------------------------------------------------

	private int findAttribIndex(String attribName) {
		if( attribName == null)
			return -1;
		for( int countAttrib = 0; countAttrib < attribData.size(); countAttrib++) {
			if( attribName.equals( attribAt( countAttrib).attribName))
				return countAttrib;
		} // for countAttrib
		return -1;
	} // DataFormat::findAttribIndex


	//--------------------------------------------------------------------
	// getNextLine -- reads the next line from the pushback reader.
	//
	// Returns a two-element Vector: element 0 is the line (String, trimmed,
	// without the line terminator) and element 1 is a Boolean that is true
	// only when the end of the input was reached before any character of a
	// new line could be read. A final line without a trailing '\n' is
	// returned normally; the end of input is reported by the next call.
	//
	// Characters read past the '\n' are pushed back for the next call.
	// A line longer than the read buffer is assembled from several reads.
	// An IOException terminates the program.
	//---------------------------------------------------------------------

	Vector getNextLine() {

		boolean fileOver = false;
		StringBuffer lineBuf = new StringBuffer();

		try {
			char tempBuf[] = new char[ConstantValues.FILE_READ_PUSHBACK_SIZE];
			boolean readAnotherChunk = true;

			while( readAnotherChunk) {
				int numCharsRead = dataPushbackRd.read( tempBuf, 0, tempBuf.length);

				if( numCharsRead == -1) { // end of input
					// only report end-of-file if nothing was collected for this line
					fileOver = (lineBuf.length() == 0);
					break;
				} // no more characters in the file

				// look for the line end among the characters actually read
				int lineEndIndex = -1;
				for( int countChar = 0; countChar < numCharsRead; countChar++) {
					if( tempBuf[countChar] == '\n') {
						lineEndIndex = countChar;
						break;
					}
				} // for countChar

				if( lineEndIndex != -1) { // the line ends inside this chunk
					int numCharsAfterLine = numCharsRead - lineEndIndex - 1;
					if( numCharsAfterLine > 0) {
						// give back only what was really read beyond the line end
						dataPushbackRd.unread( tempBuf, lineEndIndex + 1, numCharsAfterLine);
					}
					lineBuf.append( tempBuf, 0, lineEndIndex);
					readAnotherChunk = false;
				}
				else { // no line end yet -- keep the chunk and read on
					lineBuf.append( tempBuf, 0, numCharsRead);
				}
			} // while readAnotherChunk
		}  // end of try block

		catch( IOException e) {
			System.out.println("IOException while reading from PushbackReader");
			java.lang.System.exit(1);
		} // caught IOException

		Vector retValue = new Vector();
		retValue.addElement( lineBuf.toString().trim()); // trim also removes a '\r' left by CRLF line ends
		retValue.addElement( fileOver ? Boolean.TRUE : Boolean.FALSE);
		return retValue;
	} // DataFormat::getNextLine


	//--------------------------------------------------------------------------
	// getAttributeNames - returns the names of all loaded attributes, in file
	// order, as a new Vector of String
	//--------------------------------------------------------------------------

	public Vector getAttributeNames() {
		Vector listAttribNames = new Vector();
		// bounded by what was actually loaded, so a failed read cannot cause an index error
		for( int countAttrib = 0; countAttrib < attribData.size(); countAttrib++) {
			listAttribNames.addElement( attribAt( countAttrib).attribName);
		} // for countAttrib
		return listAttribNames;
	} // DataFormat::getAttributeNames


	//--------------------------------------------------------------------------
	// displayData - intended to provide a visual output of the data;
	// currently does nothing
	//--------------------------------------------------------------------------

	public void displayData() {
	} // DataFormat::displayData


	//--------------------------------------------------------------------------
	// isNumericContinuous - true if the index is in range and the attribute is
	// continuous with INTEGER or REAL datatype, i.e. its values are Numbers
	//--------------------------------------------------------------------------

	private boolean isNumericContinuous(int attribIndex) {
		if( attribIndex < 0 || attribIndex >= attribData.size())
			return false;
		AttribDetails details = attribAt( attribIndex);
		if( details.attribType != ConstantValues.VAL_CONTINUOUS)
			return false;
		return details.attribDataType == ConstantValues.DATATYPE_REAL
			|| details.attribDataType == ConstantValues.DATATYPE_INTEGER;
	} // DataFormat::isNumericContinuous


	//--------------------------------------------------------------------------
	// calcStdDev - returns the sample standard deviation for the named attribute
	// or the value NOT_A_VALID_TYPE if the name is null or unknown
	// (see calcStdDev(int) for the remaining cases)
	//--------------------------------------------------------------------------

	public double calcStdDev(String attribName) {
		int attribIndex = findAttribIndex( attribName); // zero-based index of the attribute
		if( attribIndex < 0) // invalid attribute name
			return (double)ConstantValues.NOT_A_VALID_TYPE;
		return calcStdDev( attribIndex); // calculate Std. Dev. for this attribute index
	} // DataFormat::calcStdDev


	//--------------------------------------------------------------------------
	// calcStdDev - returns the sample standard deviation (divisor n-1) for the
	// attribute with the given zero-based index.
	// Returns NOT_A_VALID_TYPE if the index is out of range, the attribute is
	// not a numeric continuous one, or there are no instances.
	// Returns 0 for a single instance, where the n-1 divisor would be zero.
	//--------------------------------------------------------------------------

	public double calcStdDev(int attribIndex) {
		if( !isNumericContinuous( attribIndex) || numInstances <= 0) {
			return (double)ConstantValues.NOT_A_VALID_TYPE;
		} // std. deviation for a discrete or symbolic attribute does not make sense

		if( numInstances == 1) {
			return 0; // avoids 0/0; one value has no spread
		}

		AttribDetails details = attribAt( attribIndex);
		double theMean = calcMean( attribIndex);
		double diffMeanSqr = 0;

		for( int countInstance = 0; countInstance < numInstances; countInstance++) {
			// values are Integer or Double, both of which are Numbers
			double theValue = ((Number)(details.attribValues.elementAt( countInstance))).doubleValue();
			diffMeanSqr += Math.pow( theValue - theMean, 2); // square of the difference
		} // for countInstance

		return Math.sqrt( diffMeanSqr/(numInstances - 1));
	} // DataFormat::calcStdDev


	//--------------------------------------------------------------------------
	// calcMean - returns the mean value for the named attribute
	// or the value NOT_A_VALID_TYPE if the name is null or unknown
	// (see calcMean(int) for the remaining cases)
	//--------------------------------------------------------------------------

	public double calcMean(String attribName) {
		int attribIndex = findAttribIndex( attribName); // zero-based index of the attribute
		if( attribIndex < 0) // invalid attribute name
			return (double)ConstantValues.NOT_A_VALID_TYPE;
		return calcMean( attribIndex); // calculate the mean value for this attribute index
	} // DataFormat::calcMean


	//--------------------------------------------------------------------------
	// calcMean - returns the arithmetic mean for the attribute with the given
	// zero-based index.
	// Returns NOT_A_VALID_TYPE if the index is out of range, the attribute is
	// not a numeric continuous one, or there are no instances.
	//--------------------------------------------------------------------------

	public double calcMean(int attribIndex) {
		if( !isNumericContinuous( attribIndex) || numInstances <= 0) {
			return (double)ConstantValues.NOT_A_VALID_TYPE;
		} // mean value for a discrete or symbolic attribute does not make sense

		AttribDetails details = attribAt( attribIndex);
		double theMean = 0;

		for( int countInstance = 0; countInstance < numInstances; countInstance++) {
			// values are Integer or Double, both of which are Numbers
			theMean += ((Number)(details.attribValues.elementAt( countInstance))).doubleValue();
		} // for countInstance

		theMean /= numInstances;
		return theMean;
	} // DataFormat::calcMean


	//--------------------------------------------------------------------------
	// getMajorityClassifier - returns the Classifier for the most frequent
	// category of the attribute to be learned (nameAttribLearn).
	//
	// classifierData is rebuilt on every call with one Classifier per
	// category; each gets its occurrence count and its accuracy (count divided
	// by numInstances, or 0 when there are no instances). On a tie the
	// category listed first wins.
	//
	// Returns null (after printing a diagnostic) if the attribute to be
	// learned is not among the loaded attributes or has no categories.
	//--------------------------------------------------------------------------

	public Classifier getMajorityClassifier() {

		// locate the concept to be learnt

		int learnIndex = findAttribIndex( nameAttribLearn);
		if( learnIndex < 0) {
			System.out.println("Attribute to be learnt :" + nameAttribLearn + " was not found among the attributes.\n");
			return null;
		} // attribute to be learnt is unknown
		AttribDetails learnAttrib = attribAt( learnIndex);

		// gather all classifiers and insert them into the vector of classifiers;
		// the vector is emptied first so that repeated calls do not accumulate entries

		classifierData.removeAllElements();
		for( int countVals = 0; countVals < learnAttrib.attribCategories.size(); countVals++) {
			Classifier tempClassifier = new Classifier();
			// categories are Integer, Double or String; toString() gives the text form of each
			tempClassifier.valClassifier = learnAttrib.attribCategories.elementAt( countVals).toString();
			classifierData.addElement( tempClassifier);
		} // for countVals - insert all elements into the classifier Vector

		if( classifierData.size() == 0) {
			System.out.println("Attribute to be learnt :" + nameAttribLearn + " has no categories.\n");
			return null;
		} // nothing to choose from


		// cycle through the instances and save the number of occurrences of each

		for( int countInstance = 0; countInstance < numInstances; countInstance++) {
			String tempValue = learnAttrib.attribValues.elementAt( countInstance).toString();

			for( int countVals = 0; countVals < classifierData.size(); countVals++) {
				Classifier candidate = (Classifier)(classifierData.elementAt( countVals));
				if( candidate.valClassifier.equals( tempValue)) {
					candidate.numItems++; // increment count by 1
					break;
				} // if strings match
			} // for each classifier
		} // for countInstance -- i.e. each instance


		// fill in the accuracy for all of them and remember the most frequent one

		int majClassifier = 0;
		int maxItems = -1; // below any real count, so the first classifier is always taken initially
		for( int countClassifiers = 0; countClassifiers < classifierData.size(); countClassifiers++) {
			Classifier candidate = (Classifier)(classifierData.elementAt( countClassifiers));

			if( numInstances > 0) {
				candidate.accuracy = ((double)candidate.numItems)/((double)numInstances);
			}
			else {
				candidate.accuracy = 0; // avoids 0/0 when no instances were loaded
			}

			if( candidate.numItems > maxItems) {
				maxItems = candidate.numItems;
				majClassifier = countClassifiers;
			} // new MajorClassifier Found
		} // for countClassifiers

		System.out.println("found the majority classifier");

		return ((Classifier)(classifierData.elementAt( majClassifier)));

	} // DataFormat::getMajorityClassifier


} // DataFormat

