/*-- Copyright 2001, 2002 Elliotte Rusty Harold. All rights reserved. This file is part of XIncluder, a Java class library for integrating XInclude processing with SAX, DOM, and JDOM. XIncluder is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 2.1 as published by the Free Software Foundation. XIncluder is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with XIncluder; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY OTHER CONTRIBUTORS TO THIS PACKAGE BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ package net.sf.tomp.xml.include; import java.io.IOException; import java.io.InputStream; /** *

 * EncodingHeuristics reads from a stream (which should be buffered) and
 * attempts to guess what the encoding of the text in the stream is. Byte order
 * marks are stripped from the stream. If it fails to determine the encoding,
 * it returns the default, UTF-8.
 *

/* Author: Elliotte Rusty Harold. Version 1.0d9, July 4, 2002. */

/**
 * Guesses the character encoding of the text in a stream from its first bytes.
 * <p>
 * The start of the stream is checked for a byte order mark, then for the byte
 * pattern a leading less-than sign has in the multi-byte Unicode encodings,
 * and finally for an XML or text declaration that names the encoding. A byte
 * order mark is consumed, so it is not part of what the caller reads next; in
 * every other case the stream is reset to where it was. If the encoding cannot
 * be determined, the default UTF-8 is reported.
 * </p>
 * <p>
 * The stream should be buffered: the implementation relies on
 * <code>mark</code> and <code>reset</code>.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d9, July 4, 2002
 */
public class EncodingHeuristics {

    /** Encoding reported whenever nothing more specific can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Number of bytes marked, and at most examined, at the start of the stream. */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {
    }

    /**
     * Determines the name of the encoding of the text in a stream.
     * <p>
     * The result is one of <code>"UnicodeBig"</code>,
     * <code>"UnicodeLittle"</code>, <code>"UTF-8"</code> or
     * <code>"UCS-4"</code> when a byte order mark is found (the mark is then
     * left consumed); <code>"UCS-4"</code>, <code>"UnicodeBigUnmarked"</code>
     * or <code>"UnicodeLittleUnmarked"</code> when the first bytes look like
     * an unmarked multi-byte Unicode document; the name given by the
     * <code>encoding</code> pseudo-attribute when the stream starts with an
     * ASCII-compatible <code>&lt;?xm</code>; and <code>"UTF-8"</code>
     * otherwise, including when anything goes wrong while looking. A declared
     * name is returned exactly as written; it is not checked against the
     * encodings the runtime supports.
     * </p>
     * <p>
     * At most 1024 bytes are examined, so an encoding declaration that ends
     * beyond that point is not recognized.
     * </p>
     *
     * @param in InputStream to read from; it must support
     *            <code>mark</code> and <code>reset</code>.
     * @return String The name of the encoding, never <code>null</code>.
     * @throws IOException if the stream cannot be reset back to where it was
     *             when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in) throws IOException {
        in.mark(LOOKAHEAD);
        try {
            int byte1 = in.read();
            int byte2 = in.read();
            // Two-byte byte order marks. No reset here or for the other
            // marks below: the mark must not be handed on to the caller.
            if ((byte1 == 0xFE) && (byte2 == 0xFF)) {
                return "UnicodeBig";
            }
            if ((byte1 == 0xFF) && (byte2 == 0xFE)) {
                // NOTE(review): a little-endian UCS-4 mark (FF FE 00 00)
                // also ends up here - confirm whether that matters.
                return "UnicodeLittle";
            }

            int byte3 = in.read();
            // UTF-8 byte order mark
            if ((byte1 == 0xEF) && (byte2 == 0xBB) && (byte3 == 0xBF)) {
                return "UTF-8";
            }

            int byte4 = in.read();
            // Four-byte byte order marks.
            // NOTE(review): "UCS-4" is the name the original code reports for
            // both patterns; whether the runtime accepts that name, and
            // whether 00 00 FF FE really denotes little-endian order, was
            // already doubted by the original author - confirm.
            if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0xFE) && (byte4 == 0xFF)) {
                return "UCS-4";
            }
            if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0xFF) && (byte4 == 0xFE)) {
                return "UCS-4";
            }

            // No byte order mark present. Whatever is recognized from here on,
            // the stream is put back to where it started.
            String encoding = DEFAULT_ENCODING;
            if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0x00) && (byte4 == '<')) {
                encoding = "UCS-4"; // less-than sign, most significant byte first
            } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == 0x00) && (byte4 == 0x00)) {
                encoding = "UCS-4"; // less-than sign, least significant byte first
            } else if ((byte1 == 0x00) && (byte2 == '<') && (byte3 == 0x00) && (byte4 == '?')) {
                encoding = "UnicodeBigUnmarked";
            } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == '?') && (byte4 == 0x00)) {
                encoding = "UnicodeLittleUnmarked";
            } else if ((byte1 == '<') && (byte2 == '?') && (byte3 == 'x') && (byte4 == 'm')) {
                // ASCII compatible, must read encoding declaration
                encoding = readDeclaredEncoding(in, byte1, byte2, byte3, byte4);
            }
            // The EBCDIC pattern (4C 6F A7 94) is not handled and therefore
            // reported as the default.
            in.reset();
            return encoding;
        } catch (Exception e) {
            // Deliberately broad: detection is best effort, so any failure
            // while looking falls back on the default. Only a failing reset
            // is reported to the caller.
            in.reset();
            return DEFAULT_ENCODING;
        }
    }

    /**
     * Reads the start of an ASCII-compatible stream, whose first four bytes
     * have already been consumed, and returns the encoding it declares.
     */
    private static String readDeclaredEncoding(InputStream in, int byte1, int byte2, int byte3,
            int byte4) throws IOException {
        byte[] data = new byte[LOOKAHEAD];
        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;
        int length = 4;
        // A single read may deliver fewer bytes than are on their way, so keep
        // reading until the declaration is complete, the look-ahead is used
        // up, or the stream ends.
        while ((length < data.length) && !containsDeclarationEnd(data, length)) {
            int count = in.read(data, length, data.length - length);
            if (count <= 0) {
                break;
            }
            length += count;
        }
        // Use Latin-1 (ISO-8859-1) because it's ASCII compatible and all byte
        // sequences are legal Latin-1 sequences, so there are no encoding
        // errors to worry about past the end of the declaration.
        return findDeclaredEncoding(new String(data, 0, length, "8859_1"));
    }

    /** Tells whether the first <code>length</code> bytes contain "?&gt;". */
    private static boolean containsDeclarationEnd(byte[] data, int length) {
        for (int i = 0; i + 1 < length; i++) {
            if ((data[i] == '?') && (data[i + 1] == '>')) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute from
     * the declaration at the start of <code>text</code>, or returns the
     * default encoding if there is no well-formed one.
     */
    private static String findDeclaredEncoding(String text) {
        // Only the declaration itself may name the encoding; the word
        // "encoding" further on in the document means nothing. A declaration
        // that is cut off by the look-ahead limit is searched as far as read.
        int end = text.indexOf("?>");
        String declaration = (end >= 0) ? text.substring(0, end) : text;

        int keyword = declaration.indexOf("encoding");
        if (keyword < 0) {
            return DEFAULT_ENCODING;
        }
        int position = skipWhiteSpace(declaration, keyword + "encoding".length());
        if ((position >= declaration.length()) || (declaration.charAt(position) != '=')) {
            return DEFAULT_ENCODING; // malformed
        }
        position = skipWhiteSpace(declaration, position + 1);
        if (position >= declaration.length()) {
            return DEFAULT_ENCODING; // malformed
        }
        char delimiter = declaration.charAt(position);
        if ((delimiter != '\'') && (delimiter != '"')) {
            return DEFAULT_ENCODING; // malformed
        }
        // now positioned to read encoding name
        int close = declaration.indexOf(delimiter, position + 1);
        if (close < 0) {
            return DEFAULT_ENCODING; // unterminated value
        }
        return declaration.substring(position + 1, close);
    }

    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not a space, tab, carriage return or line
     * feed; this is <code>text.length()</code> if there is none.
     */
    private static int skipWhiteSpace(String text, int position) {
        while (position < text.length()) {
            char c = text.charAt(position);
            if ((c != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) {
                break;
            }
            position++;
        }
        return position;
    }
}

/*
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
 * the specific language governing rights and limitations under the License. The
 * Original Code is: all this file. The Initial Developer of the Original Code
 * is: Tomas Pitner, Masaryk University in Brno, Czech Republic. Contributor(s):
 */