EncodingHeuristics xref

View Javadoc

1   /*--
2    Copyright 2001, 2002 Elliotte Rusty Harold.
3    All rights reserved.
4    This file is part of XIncluder, a Java class library for integrating XInclude
5    processing with SAX, DOM, and JDOM.
6    XIncluder is free software; you can redistribute it and/or modify
7    it under the terms of the GNU Lesser General Public License version 2.1
8    as published by the Free Software Foundation.
9    XIncluder is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   GNU Lesser General Public License for more details.
13   You should have received a copy of the GNU Lesser General Public License
14   along with XIncluder; if not, write to the Free Software
15   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16   THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED
17   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19   DISCLAIMED.  IN NO EVENT SHALL ELLIOTTE RUSTY HAROLD OR ANY
20   OTHER CONTRIBUTORS TO THIS PACKAGE
21   BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
24   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
27   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28   SUCH DAMAGE.
29   */
30  package net.sf.tomp.xml.include;
31  
32  import java.io.IOException;
33  import java.io.InputStream;
34  
/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream (which should be
 * buffered, because <code>mark</code>/<code>reset</code> are used) and
 * attempts to guess what the encoding of the text in the stream is. Byte order
 * marks are stripped from the stream. If it fails to determine the encoding,
 * it returns the default UTF-8.
 * </p>
 *
 * @author Elliotte Rusty Harold
 * @version 1.0d9, July 4, 2002
 */
public class EncodingHeuristics {
    /** Encoding reported whenever nothing more specific can be determined. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Maximum number of bytes inspected, and the read limit passed to mark. */
    private static final int LOOKAHEAD = 1024;

    // No instances allowed
    private EncodingHeuristics() {
    }

    /**
     * <p>
     * Guesses the character encoding of the text in a stream by inspecting
     * its first bytes: first a byte order mark, then the byte pattern of a
     * leading <code>&lt;</code> / <code>&lt;?</code>, and finally the
     * <code>encoding</code> pseudo-attribute of an ASCII-compatible XML or
     * text declaration.
     * </p>
     * <p>
     * When a byte order mark is recognized, the stream is left positioned
     * immediately after it. In every other case the stream is reset to the
     * position it had when the method was invoked. Up to 1024 bytes are
     * examined, so a declaration that does not fit in that window may not be
     * recognized.
     * </p>
     *
     * @param in <code>InputStream</code> to read from; it must support
     *            <code>mark</code> and <code>reset</code>.
     * @return String The name of the encoding; <code>"UTF-8"</code> if no
     *         encoding could be determined or an I/O error occurred while
     *         sniffing.
     * @throws IOException if the stream cannot be reset back to where it was
     *             when the method was invoked.
     */
    public static String readEncodingFromStream(InputStream in)
            throws IOException {
        in.mark(LOOKAHEAD);

        try {
            int byte1 = in.read();
            int byte2 = in.read();

            // UTF-16 byte order marks: deliberately not reset, so the mark is
            // stripped from what the caller subsequently reads.
            if ((byte1 == 0xFE) && (byte2 == 0xFF)) {
                return "UnicodeBig";
            }
            if ((byte1 == 0xFF) && (byte2 == 0xFE)) {
                return "UnicodeLittle";
            }

            int byte3 = in.read();

            // UTF-8 byte order mark, also stripped.
            if ((byte1 == 0xEF) && (byte2 == 0xBB) && (byte3 == 0xBF)) {
                return DEFAULT_ENCODING;
            }

            int byte4 = in.read();

            // Four-byte byte order marks, also stripped.
            // NOTE(review): both byte orders are reported as "UCS-4", exactly
            // as before; confirm the consumer can tell them apart.
            if (matches(byte1, byte2, byte3, byte4, 0x00, 0x00, 0xFE, 0xFF)
                    || matches(byte1, byte2, byte3, byte4, 0x00, 0x00, 0xFF, 0xFE)) {
                return "UCS-4";
            }

            // No byte order mark: look at how a leading '<' or "<?" is
            // laid out in the first four bytes.
            String encoding = DEFAULT_ENCODING;

            if (matches(byte1, byte2, byte3, byte4, 0x00, 0x00, 0x00, '<')
                    || matches(byte1, byte2, byte3, byte4, '<', 0x00, 0x00, 0x00)) {
                encoding = "UCS-4";
            } else if (matches(byte1, byte2, byte3, byte4, 0x00, '<', 0x00, '?')) {
                encoding = "UnicodeBigUnmarked";
            } else if (matches(byte1, byte2, byte3, byte4, '<', 0x00, '?', 0x00)) {
                encoding = "UnicodeLittleUnmarked";
            } else if (matches(byte1, byte2, byte3, byte4, '<', '?', 'x', 'm')) {
                // ASCII compatible, must read the encoding declaration
                encoding = parseDeclaredEncoding(readDeclaration(in, byte1,
                        byte2, byte3, byte4));
            }
            // An EBCDIC-compatible start (0x4C 0x6F 0xA7 0x94) is not
            // supported and falls through to the default.

            in.reset();

            return encoding;
        } catch (IOException e) {
            // Sniffing is best effort: rewind and fall back on the default.
            in.reset();

            return DEFAULT_ENCODING;
        }
    }

    /** Tells whether four bytes read from the stream equal the expected four. */
    private static boolean matches(int b1, int b2, int b3, int b4, int e1,
            int e2, int e3, int e4) {
        return (b1 == e1) && (b2 == e2) && (b3 == e3) && (b4 == e4);
    }

    /**
     * Reads the XML/text declaration, up to but excluding its closing "?>",
     * given the four bytes that have already been consumed. If no "?>" shows
     * up within the look-ahead window, everything that was read is returned.
     */
    private static String readDeclaration(InputStream in, int byte1,
            int byte2, int byte3, int byte4) throws IOException {
        byte[] data = new byte[LOOKAHEAD];

        data[0] = (byte) byte1;
        data[1] = (byte) byte2;
        data[2] = (byte) byte3;
        data[3] = (byte) byte4;

        int length = 4;
        int end = -1;

        // A single read may return fewer bytes than requested, so keep
        // reading until the declaration is complete, the window is full or
        // the stream ends. Stopping at "?>" avoids waiting for more data than
        // is needed.
        while ((end < 0) && (length < data.length)) {
            int count = in.read(data, length, data.length - length);

            if (count <= 0) {
                break;
            }

            // start one byte early in case "?>" straddles two reads
            int from = length - 1;

            length += count;
            end = indexOfDeclarationEnd(data, from, length);
        }

        // Latin-1 is ASCII compatible and every byte sequence is legal in
        // it, so decoding cannot fail.
        return new String(data, 0, (end >= 0) ? end : length, "ISO-8859-1");
    }

    /** Finds the index of the first "?>" in data[from..to), or -1 if absent. */
    private static int indexOfDeclarationEnd(byte[] data, int from, int to) {
        for (int i = from; i < to - 1; i++) {
            if ((data[i] == '?') && (data[i + 1] == '>')) {
                return i;
            }
        }

        return -1;
    }

    /**
     * Extracts the value of the encoding pseudo-attribute from a declaration;
     * returns the default encoding if it is missing, empty or malformed.
     */
    private static String parseDeclaredEncoding(String declaration) {
        int position = declaration.indexOf("encoding");

        if (position < 0) { // no encoding declared
            return DEFAULT_ENCODING;
        }

        // white space is allowed on both sides of the equals sign
        position = skipWhitespace(declaration, position + "encoding".length());

        if ((position >= declaration.length())
                || (declaration.charAt(position) != '=')) { // malformed
            return DEFAULT_ENCODING;
        }

        position = skipWhitespace(declaration, position + 1);

        if (position >= declaration.length()) { // malformed
            return DEFAULT_ENCODING;
        }

        char delimiter = declaration.charAt(position);

        if ((delimiter != '\'') && (delimiter != '"')) { // malformed
            return DEFAULT_ENCODING;
        }

        int close = declaration.indexOf(delimiter, position + 1);

        if (close <= position + 1) { // unterminated or empty value
            return DEFAULT_ENCODING;
        }

        return declaration.substring(position + 1, close);
    }

    /** Returns the index of the first non-white-space character at or after position. */
    private static int skipWhitespace(String text, int position) {
        while (position < text.length()) {
            char c = text.charAt(position);

            if ((c != ' ') && (c != '\t') && (c != '\r') && (c != '\n')) {
                break;
            }

            position++;
        }

        return position;
    }
}
240 
241 /*
242  * The contents of this file are subject to the Mozilla Public License Version
243  * 1.1 (the "License"); you may not use this file except in compliance with the
244  * License. You may obtain a copy of the License at http://www.mozilla.org/MPL/
245  * Software distributed under the License is distributed on an "AS IS" basis,
246  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
247  * the specific language governing rights and limitations under the License. The
248  * Original Code is: all this file. The Initial Developer of the Original Code
249  * is: Tomas Pitner, Masaryk University in Brno, Czech Republic. Contributor(s):
250  */