/*
 * Copyright (C) 2001-2010 Stephen Ostermiller
 * http://ostermiller.org/contact.pl?regarding=Java+Utilities
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * See LICENSE.txt for details.
 */
package com.Ostermiller.util;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
/**
 * Read files in comma separated value format as outputted by the Microsoft
 * Excel Spreadsheet program.
 * More information about this class is available from <a target="_top" href=
 * "http://ostermiller.org/utils/CSV.html">ostermiller.org</a>.
 * <P>
 * Excel CSV is a file format used as a portable representation of a database. The file
 * format is described by RFC 4180.
 * <P>
 * Each line is one entry or record and the fields in a record are separated by commas.
 * If a field includes a comma or a new line, the whole field must be surrounded with double quotes.
 * When the field is in quotes, any quote literals must be escaped by two quotes ("").
 * Text that comes after quotes that have been closed but come before the next comma will be ignored.
 * <P>
 * Empty fields are returned as a String of length zero: "". The following line has three empty
 * fields and three non-empty fields in it. There is an empty field on each end, and one in the
 * middle. One token is returned as a space.<br>
 * <pre>,second,, ,fifth,</pre>
 * <P>
 * Blank lines are always ignored.	Other lines will be ignored if they start with a
 * comment character as set by the setCommentStart() method.
 * <P>
 * An example of how ExcelCSVParser might be used:
 * <pre>
 * ExcelCSVParser shredder = new ExcelCSVParser(System.in);
 * String t;
 * while ((t = shredder.nextValue()) != null){
 *     System.out.println("" + shredder.lastLineNumber() + " " + t);
 * }
 * </pre>
 * <P>
 * The CSV that Excel outputs differs from the format read by com.Ostermiller.util.CSVParser:
 * <ul><li>Leading and trailing whitespace is significant.</li>
 * <li>A backslash is not a special character and is not used to escape anything.</li>
 * <li>Quotes inside quoted strings are escaped with a double quote rather than a backslash.</li>
 * <li>Excel may convert data before putting it in CSV format:<ul>
 * <li>Tabs are converted to a single space.</li>
 * <li>New lines in the data are always represented as the UNIX new line. ("\n")</li>
 * <li>Numbers that are greater than 12 digits may be represented in truncated
 * scientific notation form.</li></ul>
 * This parser does not attempt to fix these Excel conversions, but users should be aware
 * of them.</li></ul>
 *
 * @see com.Ostermiller.util.CSVParser
 *
 * @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
 * @since ostermillerutils 1.00.00
 */
public class ExcelCSVParser implements CSVParse {
	/**
	 * InputStream on which this parser is based, or null if the
	 * parser was constructed from a Reader.
	 *
	 * @since ostermillerutils 1.02.22
	 */
	private final InputStream inStream;
	/**
	 * Reader on which this parser is based, or null if the
	 * parser was constructed from an InputStream.
	 *
	 * @since ostermillerutils 1.02.22
	 */
	private final Reader inReader;
	/**
	 * Does all the dirty work.
	 * Calls for new tokens are routed through
	 * this object.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	private final ExcelCSVLexer lexer;
	/**
	 * Token cache. Used for when we request a token
	 * from the lexer but can't return it because it's
	 * on the next line.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	private String tokenCache;
	/**
	 * Line cache. The line number that goes along with
	 * the tokenCache. Not valid if the tokenCache is
	 * null.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	private int lineCache;
	/**
	 * The line number the last token came from, or -1 if
	 * no tokens have been returned.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	private int lastLine = -1;
	/**
	 * Create a parser to parse delimited values from
	 * an InputStream.
	 *
	 * @param in stream that contains delimited values.
	 * @param delimiter character that separates the values within a line
	 *
	 * @throws BadDelimiterException if the specified delimiter cannot be used
	 *
	 * @since ostermillerutils 1.02.24
	 */
	public ExcelCSVParser(InputStream in, char delimiter) throws BadDelimiterException {
		inStream = in;
		inReader = null;
		lexer = new ExcelCSVLexer(in);
		changeDelimiter(delimiter);
	}
	/**
	 * Create a parser to parse comma separated values from
	 * an InputStream.
	 *
	 * @param in stream that contains comma separated values.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public ExcelCSVParser(InputStream in){
		inStream = in;
		inReader = null;
		lexer = new ExcelCSVLexer(in);
	}
	/**
	 * Create a parser to parse delimited values from
	 * a Reader.
	 *
	 * @param in reader that contains delimited values.
	 * @param delimiter character that separates the values within a line
	 *
	 * @throws BadDelimiterException if the specified delimiter cannot be used
	 *
	 * @since ostermillerutils 1.02.24
	 */
	public ExcelCSVParser(Reader in, char delimiter) throws BadDelimiterException {
		inStream = null;
		inReader = in;
		lexer = new ExcelCSVLexer(in);
		changeDelimiter(delimiter);
	}
	/**
	 * Create a parser to parse comma separated values from
	 * a Reader.
	 *
	 * @param in reader that contains comma separated values.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public ExcelCSVParser(Reader in){
		inStream = null;
		inReader = in;
		lexer = new ExcelCSVLexer(in);
	}
	/**
	 * Close any stream upon which this parser is based.
	 *
	 * @since ostermillerutils 1.02.22
	 * @throws IOException if an error occurs while closing the stream.
	 */
	public void close() throws IOException {
		// Only one of the two sources is ever set; see the constructors.
		if (inStream != null){
			inStream.close();
		}
		if (inReader != null){
			inReader.close();
		}
	}
	/**
	 * Get the next value.
	 * <p>
	 * The line number reported by lastLineNumber() is only updated when a
	 * value is actually returned; reaching the end of the input leaves it
	 * untouched.
	 *
	 * @return the next value or null if there are no more values.
	 * @throws IOException if an error occurs while reading.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public String nextValue() throws IOException {
		if (tokenCache == null){
			tokenCache = lexer.getNextToken();
			lineCache = lexer.getLineNumber();
		}
		String result = tokenCache;
		tokenCache = null;
		if (result != null){
			// Do not clobber the last line number when the input is exhausted,
			// so that it keeps meaning "line of the last returned token".
			lastLine = lineCache;
		}
		return result;
	}
	/**
	 * Get the line number that the last token came from.
	 * <p>
	 * New line breaks that occur in the middle of a token are not
	 * counted in the line number count.
	 *
	 * @return line number or -1 if no tokens have been returned yet.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public int lastLineNumber(){
		return lastLine;
	}
	/**
	 * Get all the values from a line.
	 * <p>
	 * If the line has already been partially read, only the
	 * values that have not already been read will be included.
	 *
	 * @return all the values from the line or null if there are no more values.
	 * @throws IOException if an error occurs while reading.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public String[] getLine() throws IOException{
		int lineNumber = -1;
		List<String> values = new ArrayList<String>();
		if (tokenCache != null){
			// A previous call read one token too far; it starts this line.
			values.add(tokenCache);
			lineNumber = lineCache;
		}
		// Keep reading until the input ends or a token from another line shows up.
		// That token stays in tokenCache for the next call.
		while ((tokenCache = lexer.getNextToken()) != null
				&& (lineNumber == -1 || lexer.getLineNumber() == lineNumber)){
			values.add(tokenCache);
			lineNumber = lexer.getLineNumber();
		}
		if (values.isEmpty()){
			return null;
		}
		lastLine = lineNumber;
		lineCache = lexer.getLineNumber();
		return values.toArray(new String[values.size()]);
	}
	/**
	 * Get all the values from the file.
	 * <p>
	 * If the file has already been partially read, only the
	 * values that have not already been read will be included.
	 * <p>
	 * Each line of the file that has at least one value will be
	 * represented. Comments and empty lines are ignored.
	 * <p>
	 * The resulting double array may be jagged.
	 *
	 * @return all the values from the file or null if there are no more values.
	 * @throws IOException if an error occurs while reading.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public String[][] getAllValues() throws IOException {
		List<String[]> lines = new ArrayList<String[]>();
		String[] line;
		while ((line = getLine()) != null){
			lines.add(line);
		}
		if (lines.isEmpty()){
			return null;
		}
		return lines.toArray(new String[lines.size()][]);
	}
	/**
	 * Change this parser so that it uses a new delimiter.
	 * <p>
	 * The initial character is a comma, the delimiter cannot be changed
	 * to a quote or other character that has special meaning in CSV.
	 *
	 * @param newDelim delimiter to which to switch.
	 * @throws BadDelimiterException if the character cannot be used as a delimiter.
	 *
	 * @since ostermillerutils 1.02.08
	 */
	public void changeDelimiter(char newDelim) throws BadDelimiterException {
		lexer.changeDelimiter(newDelim);
	}
	/**
	 * Change this parser so that it uses a new character for quoting.
	 * <p>
	 * The initial character is a double quote ("), the quote character cannot
	 * be changed to a comma or other character that has special meaning in CSV.
	 *
	 * @param newQuote character to use for quoting.
	 * @throws BadQuoteException if the character cannot be used as a quote.
	 *
	 * @since ostermillerutils 1.02.16
	 */
	public void changeQuote(char newQuote) throws BadQuoteException {
		lexer.changeQuote(newQuote);
	}
	/**
	 * Set the characters that indicate a comment at the beginning of the line.
	 * For example if the string "#;!" were passed in, all of the following lines
	 * would be comments:<br>
	 * <pre> # Comment
	 * ; Another Comment
	 * ! Yet another comment</pre>
	 * By default there are no comments in CSV files. Commas and quotes may not be
	 * used to indicate comment lines.
	 *
	 * @param commentDelims list of characters a comment line may start with.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public void setCommentStart(String commentDelims){
		lexer.setCommentStart(commentDelims);
	}
	/**
	 * Get the number of the line from which the last value was retrieved.
	 *
	 * @return line number or -1 if no tokens have been returned.
	 *
	 * @since ostermillerutils 1.00.00
	 */
	public int getLastLineNumber(){
		return lastLine;
	}
	/**
	 * Parse the comma delimited data from a string.
	 *
	 * @param s string with comma delimited data to parse.
	 * @return parsed data, or null if the string contains no values.
	 *
	 * @since ostermillerutils 1.02.03
	 */
	public static String[][] parse(String s){
		try {
			return (new ExcelCSVParser(new StringReader(s))).getAllValues();
		} catch (IOException ignored){
			// Not expected when reading from an in-memory StringReader; null is
			// kept as the result so existing callers see no new exception.
			return null;
		}
	}
	/**
	 * Parse the delimited data from a string.
	 *
	 * @param s string with delimited data to parse.
	 * @param delimiter character that separates the values within a line
	 * @return parsed data, or null if the string contains no values.
	 * @throws BadDelimiterException if the character cannot be used as a delimiter.
	 *
	 * @since ostermillerutils 1.02.24
	 */
	public static String[][] parse(String s, char delimiter) throws BadDelimiterException {
		try {
			return (new ExcelCSVParser(new StringReader(s), delimiter)).getAllValues();
		} catch (IOException ignored){
			// Not expected when reading from an in-memory StringReader; null is
			// kept as the result so existing callers see no new exception.
			return null;
		}
	}
	/**
	 * Parse the comma delimited data from a reader.
	 *
	 * @param in Reader with comma delimited data to parse.
	 * @return parsed data, or null if the reader supplies no values.
	 * @throws IOException if an error occurs while reading.
	 *
	 * @since ostermillerutils 1.02.03
	 */
	public static String[][] parse(Reader in) throws IOException {
		return (new ExcelCSVParser(in)).getAllValues();
	}
	/**
	 * Parse the delimited data from a reader.
	 *
	 * @param in Reader with delimited data to parse.
	 * @param delimiter character that separates the values within a line
	 * @return parsed data, or null if the reader supplies no values.
	 * @throws BadDelimiterException if the character cannot be used as a delimiter.
	 * @throws IOException if an error occurs while reading.
	 *
	 * @since ostermillerutils 1.02.24
	 */
	public static String[][] parse(Reader in, char delimiter) throws IOException, BadDelimiterException {
		return (new ExcelCSVParser(in, delimiter)).getAllValues();
	}
}