1 /*-- 2 Copyright 2001, 2002 Elliotte Rusty Harold. 3 All rights reserved. 4 This file is part of XIncluder, a Java class library for integrating XInclude 5 processing with SAX, DOM, and JDOM. 6 XIncluder is free software; you can redistribute it and/or modify 7 it under the terms of the GNU Lesser General Public License version 2.1 8 as published by the Free Software Foundation. 9 XIncluder is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU Lesser General Public License for more details. 13 You should have received a copy of the GNU Lesser General Public License 14 along with XIncluder; if not, write to the Free Software 15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED 17 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 18 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY 20 OTHER CONTRIBUTORS TO THIS PACKAGE 21 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 24 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 25 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 27 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 SUCH DAMAGE. 29 */ 30 package net.sf.tomp.xml.include; 31 32 import java.io.IOException; 33 import java.io.InputStream; 34 35 /*** 36 * <p> 37 * <code>EncodingHeuristics</code> reads from a stream (which should be 38 * buffered) and attempts to guess what the encoding of the text in the stream 39 * is. Byte order marks are stripped from the stream. If it fails to determine 40 * the type of the encoding, it returns the default UTF-8. 41 * </p> 42 * 43 * @author Elliotte Rusty Harold 44 * @version 1.0d9, July 4, 2002 45 */ 46 public class EncodingHeuristics { 47 // No instances allowed 48 private EncodingHeuristics() { 49 } 50 51 /*** 52 * <p> 53 * This utility method ????. 54 * </p> 55 * 56 * @param in <code>InputStream</code> to read from. 57 * @return String The name of the encoding. 58 * @throws IOException if the stream cannot be reset back to where it was 59 * when the method was invoked. 60 */ 61 public static String readEncodingFromStream(InputStream in) 62 throws IOException { 63 // This may fail if there are a lot of space characters before the end 64 // of the encoding declaration 65 in.mark(1024); 66 67 try { 68 // lots of things can go wrong here. If any do, I just return null 69 // so that we'll fall back on the encoding declaration or the 70 // UTF-8 default 71 int byte1 = in.read(); 72 int byte2 = in.read(); 73 74 if ((byte1 == 0xFE) && (byte2 == 0xFF)) { 75 // don't reset because the byte order mark should not be 76 // included???? 77 return "UnicodeBig"; // name for big-endian???? 78 } else if ((byte1 == 0xFF) && (byte2 == 0xFE)) { 79 // don't reset because the byte order mark should not be 80 // included???? 81 // will the reader throw away the byte order mark or will it 82 // return it???? 83 return "UnicodeLittle"; 84 } 85 86 /* 87 * In accordance with the Character Model [Character Model], when 88 * the text format is a Unicode encoding, the XInclude processor 89 * must fail the inclusion when the text in the selected range is 90 * non-normalized. When transcoding characters to a Unicode encoding 91 * from a legacy encoding, a normalizing transcoder must be used. 92 */ 93 int byte3 = in.read(); 94 95 // check for UTF-8 byte order mark 96 if ((byte1 == 0xEF) && (byte2 == 0xBB) && (byte3 == 0xBF)) { 97 // don't reset because the byte order mark should not be 98 // included???? 99 // in general what happens if text document includes non-XML 100 // legal chars???? 101 return "UTF-8"; 102 } 103 104 int byte4 = in.read(); 105 106 if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0xFE) 107 && (byte4 == 0xFF)) { 108 // don't reset because the byte order mark should not be 109 // included???? 110 return "UCS-4"; // right name for big-endian UCS-4 in Java 111 112 // 1.4???? 113 } else if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0xFF) 114 && (byte4 == 0xFE)) { 115 // don't reset because the byte order mark should not be 116 // included???? 117 return "UCS-4"; // right name for little-endian UCS-4 in Java 118 119 // 1.4???? 120 } 121 122 // no byte order mark present; first character must be 123 // less than sign or white space 124 // Let's look for less-than signs first 125 if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0x00) 126 && (byte4 == '<')) { 127 in.reset(); 128 129 return "UCS-4"; // right name for big-endian UCS-4 in Java 130 131 // 1.4???? 132 } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == 0x00) 133 && (byte4 == 0x00)) { 134 in.reset(); 135 136 return "UCS-4"; // right name for little-endian UCS-4 in Java 137 138 // 1.4???? 139 } else if ((byte1 == 0x00) && (byte2 == '<') && (byte3 == 0x00) 140 && (byte4 == '?')) { 141 in.reset(); 142 143 return "UnicodeBigUnmarked"; 144 } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == '?') 145 && (byte4 == 0x00)) { 146 in.reset(); 147 148 return "UnicodeLittleUnmarked"; 149 } else if ((byte1 == '<') && (byte2 == '?') && (byte3 == 'x') 150 && (byte4 == 'm')) { 151 // ASCII compatible, must read encoding declaration 152 // 1024 bytes will be far enough to read most XML declarations 153 byte[] data = new byte[1024]; 154 155 data[0] = (byte) byte1; 156 data[1] = (byte) byte2; 157 data[2] = (byte) byte3; 158 data[3] = (byte) byte4; 159 160 int length = in.read(data, 4, 1020) + 4; 161 162 // Use Latin-1 (ISO-8859-1) because it's ASCII compatible and 163 // all byte sequences are legal Latin-1 sequences so I don't 164 // have 165 // to worry about encoding errors if I slip past the 166 // end of the XML/text declaration 167 String declaration = new String(data, 0, length, "8859_1"); 168 169 // if any of these throw a StringIndexOutOfBoundsException 170 // we just fall into the catch bloclk and return null 171 // since this can't be well-formed XML 172 int position = declaration.indexOf("encoding") + 8; 173 char c; 174 175 // get rid of white space before equals sign 176 while (true) { 177 c = declaration.charAt(position++); 178 179 if ((c != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) { 180 break; 181 } 182 } 183 184 if (c != '=') { // malformed 185 in.reset(); 186 187 return "UTF-8"; 188 } 189 190 // get rid of white space after equals sign 191 while (true) { 192 c = declaration.charAt(position++); 193 194 if ((c != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) { 195 break; 196 } 197 } 198 199 char delimiter = c; 200 201 if ((delimiter != '\'') && (delimiter != '"')) { // malformed 202 in.reset(); 203 204 return "UTF-8"; 205 } 206 207 // now positioned to read encoding name 208 StringBuffer encodingName = new StringBuffer(); 209 210 while (true) { 211 c = declaration.charAt(position++); 212 213 if (c == delimiter) { 214 break; 215 } 216 217 encodingName.append(c); 218 } 219 220 in.reset(); 221 222 return encodingName.toString(); 223 } else if ((byte1 == 0x4C) && (byte2 == 0x6F) && (byte3 == 0xA7) 224 && (byte4 == 0x94)) { 225 // EBCDIC compatible, must read encoding declaration 226 // ???? 227 } 228 } catch (Exception e) { 229 in.reset(); 230 231 return "UTF-8"; 232 } 233 234 // no XML or text declaration present 235 in.reset(); 236 237 return "UTF-8"; 238 } 239 } 240 241 /* 242 * The contents of this file are subject to the Mozilla Public License Version 243 * 1.1 (the "License"); you may not use this file except in compliance with the 244 * License. You may obtain a copy of the License at http://www.mozilla.org/MPL/ 245 * Software distributed under the License is distributed on an "AS IS" basis, 246 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for 247 * the specific language governing rights and limitations under the License. The 248 * Original Code is: all this file. The Initial Developer of the Original Code 249 * is: Tomas Pitner, Masaryk University in Brno, Czech Republic. Contributor(s): 250 */