1 /*--
2 Copyright 2001, 2002 Elliotte Rusty Harold.
3 All rights reserved.
4 This file is part of XIncluder, a Java class library for integrating XInclude
5 processing with SAX, DOM, and JDOM.
6 XIncluder is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License version 2.1
8 as published by the Free Software Foundation.
9 XIncluder is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU Lesser General Public License for more details.
13 You should have received a copy of the GNU Lesser General Public License
14 along with XIncluder; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
17 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
20 OTHER CONTRIBUTORS TO THIS PACKAGE
21 BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
24 USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
27 OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 SUCH DAMAGE.
29 */
30 package net.sf.tomp.xml.include;
31
32 import java.io.IOException;
33 import java.io.InputStream;
34
/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream (which should be
 * buffered and must support <code>mark</code>/<code>reset</code>) and
 * attempts to guess what the encoding of the text in the stream is. Byte
 * order marks are stripped from the stream. If it fails to determine the
 * type of the encoding, it returns the default UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d9, July 4, 2002
 */
public class EncodingHeuristics {

    /** Encoding reported whenever nothing better can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /**
     * Maximum number of bytes that are ever consumed while sniffing. It is
     * also used as the read limit passed to <code>InputStream.mark</code>,
     * so the stream can always be reset afterwards.
     */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {
    }

    /**
     * <p>
     * Guesses the character encoding of the text in <code>in</code> by
     * looking at its first bytes.
     * </p>
     * <p>
     * If the stream starts with a byte order mark, the mark is consumed and
     * the stream is left positioned directly after it. In every other case
     * the stream is reset to the position it had when this method was
     * called. When the stream starts with an ASCII-compatible
     * <code>&lt;?xml ...?&gt;</code> declaration, the value of its
     * <code>encoding</code> pseudo-attribute is returned verbatim. Only the
     * declaration itself is inspected, and at most 1024 bytes are read, so
     * a declaration with very long runs of white space may not be
     * recognized. Read errors while sniffing are not reported; the method
     * falls back to UTF-8 instead.
     * </p>
     *
     * @param in <code>InputStream</code> to read from.
     * @return String The name of the encoding; never <code>null</code>,
     *         <code>"UTF-8"</code> if the encoding could not be
     *         determined.
     * @throws IOException if the stream cannot be reset back to where it was
     *             when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
            throws IOException {
        in.mark(LOOKAHEAD);

        try {
            return detect(in);
        } catch (IOException e) {
            // Best effort: a failure while sniffing is not fatal. Put the
            // stream back (this may itself throw, which is reported to the
            // caller) and let the caller proceed with the default.
            in.reset();

            return DEFAULT_ENCODING;
        }
    }

    /**
     * Performs the actual sniffing. Consumes a byte order mark if there is
     * one; otherwise resets the stream before returning.
     */
    private static String detect(InputStream in) throws IOException {
        // At end of stream read() yields -1, which matches none of the
        // patterns below, so short streams simply end up with the default.
        int byte1 = in.read();
        int byte2 = in.read();

        // UTF-16 byte order marks. No reset: the mark is not part of the
        // text and must not be handed to the caller.
        if ((byte1 == 0xFE) && (byte2 == 0xFF)) {
            return "UnicodeBig";
        }
        if ((byte1 == 0xFF) && (byte2 == 0xFE)) {
            return "UnicodeLittle";
        }

        int byte3 = in.read();

        // UTF-8 byte order mark, also stripped
        if ((byte1 == 0xEF) && (byte2 == 0xBB) && (byte3 == 0xBF)) {
            return DEFAULT_ENCODING;
        }

        int byte4 = in.read();

        // Four-byte byte order marks, also stripped.
        // NOTE(review): both byte orders are reported under the same name
        // "UCS-4", exactly as before; confirm which charset name callers
        // actually expect for each byte order.
        if ((byte1 == 0x00) && (byte2 == 0x00)
                && (((byte3 == 0xFE) && (byte4 == 0xFF))
                        || ((byte3 == 0xFF) && (byte4 == 0xFE)))) {
            return "UCS-4";
        }

        // No byte order mark present, so nothing may be swallowed: every
        // path from here on ends with a reset of the stream.
        String encoding = DEFAULT_ENCODING;

        if ((byte1 == 0x00) && (byte2 == 0x00) && (byte3 == 0x00)
                && (byte4 == '<')) {
            encoding = "UCS-4"; // '<' as a big-endian four-byte unit
        } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == 0x00)
                && (byte4 == 0x00)) {
            encoding = "UCS-4"; // '<' as a little-endian four-byte unit
        } else if ((byte1 == 0x00) && (byte2 == '<') && (byte3 == 0x00)
                && (byte4 == '?')) {
            encoding = "UnicodeBigUnmarked";
        } else if ((byte1 == '<') && (byte2 == 0x00) && (byte3 == '?')
                && (byte4 == 0x00)) {
            encoding = "UnicodeLittleUnmarked";
        } else if ((byte1 == '<') && (byte2 == '?') && (byte3 == 'x')
                && (byte4 == 'm')) {
            // ASCII compatible, must read the encoding declaration
            encoding = parseDeclaredEncoding(readDeclaration(in));
        }
        // A stream starting with 0x4C 0x6F 0xA7 0x94 would be an EBCDIC
        // compatible declaration. Reading it is not implemented, so such
        // input gets the default like anything else that is unrecognized.

        in.reset();

        return encoding;
    }

    /**
     * Reads the rest of a declaration whose first four bytes
     * (<code>&lt;?xm</code>) have already been consumed. Reading stops after
     * the closing <code>?&gt;</code>, at end of stream, or once
     * {@link #LOOKAHEAD} bytes have been consumed in total, whichever comes
     * first.
     */
    private static String readDeclaration(InputStream in) throws IOException {
        byte[] data = new byte[LOOKAHEAD];

        data[0] = '<';
        data[1] = '?';
        data[2] = 'x';
        data[3] = 'm';

        int length = 4;

        while (length < data.length) {
            int next = in.read();

            if (next == -1) {
                break;
            }

            data[length++] = (byte) next;

            if ((next == '>') && (data[length - 2] == '?')) {
                break; // end of the declaration
            }
        }

        // Latin-1 is ASCII compatible and every byte sequence is legal in
        // it, so decoding cannot fail whatever the bytes are.
        return new String(data, 0, length, "ISO-8859-1");
    }

    /**
     * Extracts the value of the <code>encoding</code> pseudo-attribute from
     * an XML or text declaration. Returns the default encoding if the text
     * is not such a declaration, has no encoding pseudo-attribute, or is
     * malformed (including an empty or unterminated value).
     */
    private static String parseDeclaredEncoding(String declaration) {
        // "<?xml" must be followed by white space; otherwise this is some
        // other processing instruction such as <?xml-stylesheet ...?>
        if ((declaration.length() < 6) || !declaration.startsWith("<?xml")
                || !isWhiteSpace(declaration.charAt(5))) {
            return DEFAULT_ENCODING;
        }

        int keyword = declaration.indexOf("encoding");

        if (keyword < 0) {
            return DEFAULT_ENCODING;
        }

        int position = skipWhiteSpace(declaration, keyword + 8);

        if ((position >= declaration.length())
                || (declaration.charAt(position) != '=')) { // malformed
            return DEFAULT_ENCODING;
        }

        position = skipWhiteSpace(declaration, position + 1);

        if (position >= declaration.length()) { // malformed
            return DEFAULT_ENCODING;
        }

        char delimiter = declaration.charAt(position);

        if ((delimiter != '\'') && (delimiter != '"')) { // malformed
            return DEFAULT_ENCODING;
        }

        // now positioned to read encoding name
        int start = position + 1;
        int end = declaration.indexOf(delimiter, start);

        if (end <= start) { // unterminated or empty name
            return DEFAULT_ENCODING;
        }

        return declaration.substring(start, end);
    }

    /**
     * Returns the index of the first character at or after
     * <code>position</code> that is not white space, or the length of
     * <code>text</code> if there is none.
     */
    private static int skipWhiteSpace(String text, int position) {
        while ((position < text.length())
                && isWhiteSpace(text.charAt(position))) {
            position++;
        }

        return position;
    }

    /** Tests for the four white space characters checked by this class. */
    private static boolean isWhiteSpace(char c) {
        return (c == ' ') || (c == '\t') || (c == '\r') || (c == '\n');
    }
}
240
241 /*
242 * The contents of this file are subject to the Mozilla Public License Version
243 * 1.1 (the "License"); you may not use this file except in compliance with the
244 * License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
245 * Software distributed under the License is distributed on an "AS IS" basis,
246 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
247 * the specific language governing rights and limitations under the License. The
248 * Original Code is: all this file. The Initial Developer of the Original Code
249 * is: Tomas Pitner, Masaryk University in Brno, Czech Republic. Contributor(s):
250 */