"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
9
10
11import re
12import _markupbase
13
14from html import unescape
15
16
17__all__ = ['HTMLParser']
18
# Regular expressions used for parsing

# The two characters that interrupt ordinary text: the start of a
# reference ('&') or the start of markup ('<').
interesting_normal = re.compile('[&<]')
# The first two characters of something that could still become an entity
# or character reference; goahead() tries this only after entityref has
# failed to match at the same position.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference; goahead() steps back by one when that character is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# Terminator searched for by parse_pi().
piclose = re.compile('>')
# Comment terminator; not referenced by the code in this file.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Group 1 is the tag name; trailing whitespace and any '/' not followed by
# '>' are consumed as well.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# Group 1 is the attribute name, group 2 the optional '= value' part and
# group 3 the value itself (quotes, if any, are still attached).
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches a start tag up to, but not including, its closing '>' or '/>';
# check_for_whole_start_tag() inspects the character right after the match.
locatestarttagend_tolerant = re.compile(r"""
 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name
 (?:[\s/]* # optional whitespace before attribute name
 (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
 (?:\s*=+\s* # value indicator
 (?:'[^']*' # LITA-enclosed value
 |"[^"]*" # LIT-enclosed value
 |(?!['"])[^>\s]* # bare value
 )
 (?:\s*,)* # possibly followed by a comma
 )?(?:\s|/(?!>))*
 )*
 )?
 \s* # trailing whitespace
""", re.VERBOSE)
# Used by parse_endtag() to check that a closing '>' is present at all.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Group 1 is the end-tag name.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
59
60
61
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is passed through verbatim until the matching
    # end tag is seen (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data as if it were followed by end of input."""
        self.goahead(1)

    # Source text of the most recently parsed start tag (None until one is
    # parsed, or while one is being parsed).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as plain data."""
        self.cdata_elem = elem.lower()
        # Escape the name so that an element name containing regex
        # metacharacters is matched literally instead of being interpreted
        # as (or breaking) a pattern.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Return to normal parsing, where '&' and '<' are significant."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible, calling handlers.

        Whatever cannot be handled yet stays in self.rawdata for the next
        call.  When *end* is true, everything that is left after the main
        loop is delivered through handle_data(), including the content of
        a CDATA element that was never closed.
        """
        rawdata = self.rawdata
        startswith = rawdata.startswith
        # Loop-invariant: terminator of a reference that may be cut in half
        # at the end of the buffer.
        ref_terminator = re.compile(r'[\s;]')
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not ref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # The end tag has not arrived yet.
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    # Unterminated construct at end of input: emit it as
                    # text up to the next '>' (inclusive), else up to the
                    # next '<', else just the '<' itself.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern also swallowed the character after the
                    # reference; give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(";", i) >= 0:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # Possibly the beginning of a reference: wait for more
                    # input.  The position is deliberately left on the '&'
                    # so that, at end of input, the flush after the loop
                    # emits the text including the ampersand.
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly so the check survives ``python -O``;
                # without it this loop would never advance.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n:
            # Flush whatever is left.  In CDATA mode the text is passed on
            # untouched; dropping it would silently lose the content of an
            # unclosed <script> or <style> element.
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return end or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at index *i*.

        Dispatches to the comment, marked-section or bogus-comment parser,
        or reports a doctype through handle_decl().  Returns the index
        just past the construct, or -1 if it is not terminated yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return end or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse a '<!...>' or '</...>' construct as a comment.

        The text between the two-character opener and the next '>' is
        passed to handle_comment() unless *report* is false.  Returns the
        index just past the '>', or -1 if no '>' has been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' construct at *i* and call handle_pi()."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and dispatch to the tag handlers.

        Returns the index just past the tag, or a negative value when the
        tag is not complete yet.  A tag whose tail is neither '>' nor '/>'
        is reported verbatim through handle_data().
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '=': no value at all.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag:
            # hand the whole thing over as text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means that the buffer ends before the tag does, so the caller
        has to wait for more input.
        """
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        next_char = rawdata[j:j+1]
        if next_char == ">":
            return j + 1
        if next_char == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # a lone '/' at the buffer boundary
            return -1
        if next_char == "":
            # end of input
            return -1
        if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        if j > i:
            return j
        else:
            return i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and call handle_endtag().

        Inside a CDATA element only the matching end tag is honoured;
        anything else is passed to handle_data().  Returns the index just
        past the tag, or -1 if no '>' has been seen yet.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle a declaration that is not otherwise recognized
    def unknown_decl(self, data):
        pass