/*
* Copyright (C) 2001-2010 Stephen Ostermiller
* http://ostermiller.org/contact.pl?regarding=Java+Utilities
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* See LICENSE.txt for details.
*/
package com.Ostermiller.util;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
/**
* Read files in comma separated value format as outputted by the Microsoft
* Excel Spreadsheet program.
* More information about this class is available from <a target="_top" href=
* "http://ostermiller.org/utils/CSV.html">ostermiller.org</a>.
* <P>
* Excel CSV is a file format used as a portable representation of a database. The file
* format is described by RFC 4180.
* <P>
* Each line is one entry or record and the fields in a record are separated by commas.
* If a field includes a comma or a new line, the whole field must be surrounded with double quotes.
* When the field is in quotes, any quote literals must be escaped by two quotes ("").
* Text that comes after quotes that have been closed but comes before the next comma will be ignored.
* <P>
* Empty fields are returned as a String of length zero: "". The following line has three empty
* fields and three non-empty fields in it. There is an empty field on each end, and one in the
* middle. One token is returned as a space.<br>
* <pre>,second,, ,fifth,</pre>
* <P>
* Blank lines are always ignored. Other lines will be ignored if they start with a
* comment character as set by the setCommentStart() method.
* <P>
* An example of how ExcelCSVParser might be used:
* <pre>
* ExcelCSVParser shredder = new ExcelCSVParser(System.in);
* String t;
* while ((t = shredder.nextValue()) != null){
* System.out.println("" + shredder.lastLineNumber() + " " + t);
* }
* </pre>
* <P>
* The CSV that Excel outputs differs from the format read by com.Ostermiller.util.CSVParser:
* <ul><li>Leading and trailing whitespace is significant.</li>
* <li>A backslash is not a special character and is not used to escape anything.</li>
* <li>Quotes inside quoted strings are escaped with a double quote rather than a backslash.</li>
* <li>Excel may convert data before putting it in CSV format:<ul>
* <li>Tabs are converted to a single space.</li>
* <li>New lines in the data are always represented as the UNIX new line. ("\n")</li>
* <li>Numbers that are greater than 12 digits may be represented in truncated
* scientific notation form.</li></ul>
* This parser does not attempt to fix these Excel conversions, but users should be aware
* of them.</li></ul>
*
* @see com.Ostermiller.util.CSVParser
*
* @author Stephen Ostermiller http://ostermiller.org/contact.pl?regarding=Java+Utilities
* @since ostermillerutils 1.00.00
*/
public class ExcelCSVParser implements CSVParse {

    /**
     * InputStream on which this parser is based, or null if the parser
     * was constructed from a Reader.
     *
     * @since ostermillerutils 1.02.22
     */
    private final InputStream inStream;

    /**
     * Reader on which this parser is based, or null if the parser
     * was constructed from an InputStream.
     *
     * @since ostermillerutils 1.02.22
     */
    private final Reader inReader;

    /**
     * Does all the dirty work.
     * Calls for new tokens are routed through
     * this object.
     *
     * @since ostermillerutils 1.00.00
     */
    private final ExcelCSVLexer lexer;

    /**
     * Token cache. Used for when we request a token
     * from the lexer but can't return it because it is
     * on the next line.
     *
     * @since ostermillerutils 1.00.00
     */
    private String tokenCache;

    /**
     * Line cache. The line number that goes along with
     * the tokenCache. Not valid if the tokenCache is
     * null.
     *
     * @since ostermillerutils 1.00.00
     */
    private int lineCache;

    /**
     * The line number the last token came from, or -1 if
     * no tokens have been returned.
     *
     * @since ostermillerutils 1.00.00
     */
    private int lastLine = -1;

    /**
     * Create a parser to parse delimited values from
     * an InputStream.
     *
     * @param in stream that contains delimited values.
     * @param delimiter character that separates the fields within a record.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public ExcelCSVParser(InputStream in, char delimiter) throws BadDelimiterException {
        this.inStream = in;
        this.inReader = null;
        this.lexer = new ExcelCSVLexer(in);
        // Configure the lexer directly instead of going through the public,
        // overridable changeDelimiter(): a subclass override must not run
        // before the subclass has been initialized.
        this.lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse comma separated values from
     * an InputStream.
     *
     * @param in stream that contains comma separated values.
     *
     * @since ostermillerutils 1.00.00
     */
    public ExcelCSVParser(InputStream in) {
        this.inStream = in;
        this.inReader = null;
        this.lexer = new ExcelCSVLexer(in);
    }

    /**
     * Create a parser to parse delimited values from
     * a Reader.
     *
     * @param in reader that contains delimited values.
     * @param delimiter character that separates the fields within a record.
     *
     * @throws BadDelimiterException if the specified delimiter cannot be used
     *
     * @since ostermillerutils 1.02.24
     */
    public ExcelCSVParser(Reader in, char delimiter) throws BadDelimiterException {
        this.inStream = null;
        this.inReader = in;
        this.lexer = new ExcelCSVLexer(in);
        // See the InputStream constructor: avoid calling an overridable
        // method from a constructor.
        this.lexer.changeDelimiter(delimiter);
    }

    /**
     * Create a parser to parse comma separated values from
     * a Reader.
     *
     * @param in reader that contains comma separated values.
     *
     * @since ostermillerutils 1.00.00
     */
    public ExcelCSVParser(Reader in) {
        this.inStream = null;
        this.inReader = in;
        this.lexer = new ExcelCSVLexer(in);
    }

    /**
     * Close any stream upon which this parser is based.
     *
     * @since ostermillerutils 1.02.22
     * @throws IOException if an error occurs while closing the stream.
     */
    public void close() throws IOException {
        // Only one of the two is ever non-null (set by the constructor used).
        if (inStream != null) {
            inStream.close();
        }
        if (inReader != null) {
            inReader.close();
        }
    }

    /**
     * Get the next value.
     * <p>
     * The line number reported by {@link #lastLineNumber()} is only updated
     * when a value is actually returned; reaching the end of the input
     * leaves it unchanged.
     *
     * @return the next value or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String nextValue() throws IOException {
        if (tokenCache == null) {
            tokenCache = lexer.getNextToken();
            lineCache = lexer.getLineNumber();
        }
        String result = tokenCache;
        tokenCache = null;
        if (result != null) {
            // lastLine is documented as the line of the last *returned* token,
            // so it must not be overwritten when there is nothing to return
            // (getLine() behaves the same way when it returns null).
            lastLine = lineCache;
        }
        return result;
    }

    /**
     * Get the line number that the last token came from.
     * <p>
     * New line breaks that occur in the middle of a token are not
     * counted in the line number count.
     *
     * @return line number or -1 if no tokens have been returned yet.
     *
     * @since ostermillerutils 1.00.00
     */
    public int lastLineNumber() {
        return lastLine;
    }

    /**
     * Get all the values from a line.
     * <p>
     * If the line has already been partially read, only the
     * values that have not already been read will be included.
     *
     * @return all the values from the line or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String[] getLine() throws IOException {
        int lineNumber = -1;
        List<String> values = new ArrayList<String>();
        // A token left over from a previous call is the first value of this line.
        if (tokenCache != null) {
            values.add(tokenCache);
            lineNumber = lineCache;
        }
        // Keep pulling tokens while they are on the same line. The first token
        // that belongs to a different line stays in tokenCache for the next call;
        // at end of input tokenCache ends up null.
        while ((tokenCache = lexer.getNextToken()) != null
                && (lineNumber == -1 || lexer.getLineNumber() == lineNumber)) {
            values.add(tokenCache);
            lineNumber = lexer.getLineNumber();
        }
        if (values.isEmpty()) {
            return null;
        }
        lastLine = lineNumber;
        // Line number that goes with the token now held in tokenCache (if any).
        lineCache = lexer.getLineNumber();
        return values.toArray(new String[values.size()]);
    }

    /**
     * Get all the values from the file.
     * <p>
     * If the file has already been partially read, only the
     * values that have not already been read will be included.
     * <p>
     * Each line of the file that has at least one value will be
     * represented. Comments and empty lines are ignored.
     * <p>
     * The resulting double array may be jagged.
     *
     * @return all the values from the file or null if there are no more values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.00.00
     */
    public String[][] getAllValues() throws IOException {
        List<String[]> lines = new ArrayList<String[]>();
        String[] line;
        while ((line = getLine()) != null) {
            lines.add(line);
        }
        if (lines.isEmpty()) {
            return null;
        }
        return lines.toArray(new String[lines.size()][]);
    }

    /**
     * Change this parser so that it uses a new delimiter.
     * <p>
     * The initial character is a comma, the delimiter cannot be changed
     * to a quote or other character that has special meaning in CSV.
     *
     * @param newDelim delimiter to which to switch.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     *
     * @since ostermillerutils 1.02.08
     */
    public void changeDelimiter(char newDelim) throws BadDelimiterException {
        lexer.changeDelimiter(newDelim);
    }

    /**
     * Change this parser so that it uses a new character for quoting.
     * <p>
     * The initial character is a double quote ("), the quote cannot be changed
     * to a comma or other character that has special meaning in CSV.
     *
     * @param newQuote character to use for quoting.
     * @throws BadQuoteException if the character cannot be used as a quote.
     *
     * @since ostermillerutils 1.02.16
     */
    public void changeQuote(char newQuote) throws BadQuoteException {
        lexer.changeQuote(newQuote);
    }

    /**
     * Set the characters that indicate a comment at the beginning of the line.
     * For example if the string "#;!" were passed in, all of the following lines
     * would be comments:<br>
     * <pre> # Comment
     * ; Another Comment
     * ! Yet another comment</pre>
     * By default there are no comments in CSV files. Commas and quotes may not be
     * used to indicate comment lines.
     *
     * @param commentDelims list of characters a comment line may start with.
     *
     * @since ostermillerutils 1.00.00
     */
    public void setCommentStart(String commentDelims) {
        lexer.setCommentStart(commentDelims);
    }

    /**
     * Get the number of the line from which the last value was retrieved.
     * <p>
     * Returns the same value as {@link #lastLineNumber()}.
     *
     * @return line number or -1 if no tokens have been returned.
     *
     * @since ostermillerutils 1.00.00
     */
    public int getLastLineNumber() {
        return lastLine;
    }

    /**
     * Parse the comma delimited data from a string.
     *
     * @param s string with comma delimited data to parse.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(String s) {
        try {
            return new ExcelCSVParser(new StringReader(s)).getAllValues();
        } catch (IOException ignored) {
            // This method has no way to report a read failure to its caller
            // (no throws clause); null is its established "no data" result.
            return null;
        }
    }

    /**
     * Parse the delimited data from a string.
     *
     * @param s string with delimited data to parse.
     * @param delimiter character that separates the fields within a record.
     * @return parsed data, or null if the string contains no values or an
     *         IOException occurs while reading it.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(String s, char delimiter) throws BadDelimiterException {
        try {
            return new ExcelCSVParser(new StringReader(s), delimiter).getAllValues();
        } catch (IOException ignored) {
            // Same contract as parse(String): read failures are reported as null.
            return null;
        }
    }

    /**
     * Parse the comma delimited data from a stream.
     * <p>
     * The supplied Reader is not closed by this method.
     *
     * @param in Reader with comma delimited data to parse.
     * @return parsed data, or null if the reader supplies no values.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.03
     */
    public static String[][] parse(Reader in) throws IOException {
        return new ExcelCSVParser(in).getAllValues();
    }

    /**
     * Parse the delimited data from a stream.
     * <p>
     * The supplied Reader is not closed by this method.
     *
     * @param in Reader with delimited data to parse.
     * @param delimiter character that separates the fields within a record.
     * @return parsed data, or null if the reader supplies no values.
     * @throws BadDelimiterException if the character cannot be used as a delimiter.
     * @throws IOException if an error occurs while reading.
     *
     * @since ostermillerutils 1.02.24
     */
    public static String[][] parse(Reader in, char delimiter) throws IOException, BadDelimiterException {
        return new ExcelCSVParser(in, delimiter).getAllValues();
    }
}