1 package org.owasp.dependencycheck.xml;
2
3 import java.io.FilterInputStream;
4 import java.io.IOException;
5 import java.io.InputStream;
6 import javax.annotation.concurrent.NotThreadSafe;
7
8 import org.jspecify.annotations.NonNull;
9 import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory;
11
12 /**
13 * Cleans up often very bad XML. Primarily, this will convert named HTM entities
14 * into their HTM encoded Unicode code point representation.
15 *
16 * <ol>
17 * <li>Strips leading white space</li>
18 * <li>Recodes &pound; etc to &#...;</li>
19 * <li>Recodes lone & as &amp;</li>
20 * </ol>
21 * <p>
22 * This is a slightly modified (class/method rename) from an SO answer:
23 * https://stackoverflow.com/questions/7286428/help-the-java-sax-parser-to-understand-bad-xml</p>
24 *
25 * @author https://stackoverflow.com/users/823393/oldcurmudgeon
26 */
27 @NotThreadSafe
28 public class XmlInputStream extends FilterInputStream {
29
30 /**
31 * The logger.
32 */
33 private static final Logger LOGGER = LoggerFactory.getLogger(XmlInputStream.class);
34 /**
35 * The minimum length of characters to read.
36 */
37 private static final int MIN_LENGTH = 2;
38 /**
39 * Holder for everything we've read.
40 */
41 private final StringBuilder red = new StringBuilder();
42 /**
43 * Data that needs to be pushed back.
44 */
45 private final StringBuilder pushBack = new StringBuilder();
46 /**
47 * How much we've given them.
48 */
49 private int given = 0;
50 /**
51 * How much we've read.
52 */
53 private int pulled = 0;
54
55 /**
56 * Constructs a new XML Input Stream.
57 *
58 * @param in the base input stream
59 */
60 public XmlInputStream(InputStream in) {
61 super(in);
62 }
63
64 /**
65 * NB: This is a Troll length (i.e. it goes 1, 2, many) so 2 actually means
66 * "at least 2"
67 *
68 * @return the length
69 */
70 public int length() {
71 try {
72 final StringBuilder s = read(MIN_LENGTH);
73 pushBack.append(s);
74 return s.length();
75 } catch (IOException ex) {
76 LOGGER.warn("Oops ", ex);
77 }
78 return 0;
79 }
80
81 /**
82 * Read n characters.
83 *
84 * @param n the number of characters to read
85 * @return the characters read
86 * @throws IOException thrown when an error occurs
87 */
88 private StringBuilder read(int n) throws IOException {
89 // Input stream finished?
90 boolean eof = false;
91 // Read that many.
92 final StringBuilder s = new StringBuilder(n);
93 while (s.length() < n && !eof) {
94 // Always get from the pushBack buffer.
95 if (pushBack.length() == 0) {
96 // Read something from the stream into pushBack.
97 eof = readIntoPushBack();
98 }
99
100 // Pushback only contains deliverable codes.
101 if (pushBack.length() > 0) {
102 // Grab one character
103 s.append(pushBack.charAt(0));
104 // Remove it from pushBack
105 pushBack.deleteCharAt(0);
106 }
107
108 }
109 return s;
110 }
111
112 /**
113 * Might not actually push back anything but usually will.
114 *
115 * @return true if at end-of-file
116 * @throws IOException thrown if there is an IO exception in the underlying
117 * steam
118 */
119 private boolean readIntoPushBack() throws IOException {
120 // File finished?
121 boolean eof = false;
122 // Next char.
123 final int ch = in.read();
124 if (ch >= 0) {
125 // Discard whitespace at start?
126 if (!(pulled == 0 && isWhiteSpace(ch))) {
127 // Good code.
128 pulled += 1;
129 // Parse out the &stuff;
130 if (ch == '&') {
131 // Process the &
132 readAmpersand();
133 } else {
134 // Not an '&', just append.
135 pushBack.append((char) ch);
136 }
137 }
138 } else {
139 // Hit end of file.
140 eof = true;
141 }
142 return eof;
143 }
144
145 /**
146 * Deal with an ampersand in the stream.
147 *
148 * @throws IOException thrown if an unknown entity is encountered
149 */
150 private void readAmpersand() throws IOException {
151 // Read the whole word, up to and including the ;
152 final StringBuilder reference = new StringBuilder();
153 int ch;
154 // Should end in a ';'
155 for (ch = in.read(); isAlphaNumeric(ch); ch = in.read()) {
156 reference.append((char) ch);
157 }
158 // Did we tidily finish?
159 if (ch == ';') {
160 // Yes! Translate it into a &#nnn; code.
161 final String code = XmlEntity.fromNamedReference(reference);
162 if (code != null) {
163 // Keep it.
164 pushBack.append(code);
165 } else {
166 // invalid entity. Encode the & and append the sequence of chars.
167 pushBack.append("&").append(reference).append((char) ch);
168 }
169 } else {
170 // Did not terminate properly!
171 // Perhaps an & on its own or a malformed reference.
172 // Either way, escape the &
173 pushBack.append("&").append(reference).append((char) ch);
174 }
175 }
176
177 /**
178 * Keep track of what we've given them.
179 *
180 * @param s the sequence of characters given
181 * @param wanted the number of characters wanted
182 * @param got the number of characters given
183 */
184 private void given(CharSequence s, int wanted, int got) {
185 red.append(s);
186 given += got;
187 LOGGER.trace("Given: [" + wanted + "," + got + "]-" + s);
188 }
189
190 /**
191 * Reads the next byte.
192 *
193 * @return the byte read
194 * @throws IOException thrown when there is an problem reading
195 */
196 @Override
197 public int read() throws IOException {
198 final StringBuilder s = read(1);
199 given(s, 1, 1);
200 return s.length() > 0 ? s.charAt(0) : -1;
201 }
202
203 /**
204 * Reads the next length of bytes from the stream into the given byte array
205 * at the given offset.
206 *
207 * @param data the buffer to store the data read
208 * @param offset the offset in the buffer to start writing
209 * @param length the length of data to read
210 * @return the number of bytes read
211 * @throws IOException thrown when there is an issue with the underlying
212 * stream
213 */
214 @Override
215 public int read(@NonNull byte[] data, int offset, int length) throws IOException {
216 final StringBuilder s = read(length);
217 int n = 0;
218 for (int i = 0; i < Math.min(length, s.length()); i++) {
219 data[offset + i] = (byte) s.charAt(i);
220 n += 1;
221 }
222 given(s, length, n);
223 return n > 0 ? n : -1;
224 }
225
226 /**
227 * To string implementation.
228 *
229 * @return a string representation of the data given and read from the
230 * stream.
231 */
232 @Override
233 public String toString() {
234 final String s = red.toString();
235 final StringBuilder h = new StringBuilder();
236 // Hex dump the small ones.
237 if (s.length() < 8) {
238 for (int i = 0; i < s.length(); i++) {
239 h.append(" ").append(Integer.toHexString(s.charAt(i)));
240 }
241 }
242 return "[" + given + "]-\"" + s + "\"" + (h.length() > 0 ? " (" + h + ")" : "");
243 }
244
245 /**
246 * Determines if the character is whitespace.
247 *
248 * @param ch the character to check
249 * @return true if the character is whitespace; otherwise false
250 */
251 private boolean isWhiteSpace(int ch) {
252 switch (ch) {
253 case ' ':
254 case '\r':
255 case '\n':
256 case '\t':
257 return true;
258 default:
259 return false;
260 }
261 }
262
263 /**
264 * Checks if the given character is alpha-numeric.
265 *
266 * @param ch the character to check
267 * @return true if the character is alpha-numeric; otherwise false.
268 */
269 private boolean isAlphaNumeric(int ch) {
270 return ('a' <= ch && ch <= 'z')
271 || ('A' <= ch && ch <= 'Z')
272 || ('0' <= ch && ch <= '9');
273 }
274 }