1212from html5lib import utils
1313from xml .sax .saxutils import escape
1414
15+ import re
16+
1517spaceCharacters = u"" .join (spaceCharacters )
1618
1719try :
@@ -84,6 +86,7 @@ class HTMLSerializer(object):
8486 resolve_entities = True
8587
8688 # miscellaneous options
89+ emit_doctype = 'preserve'
8790 inject_meta_charset = True
8891 strip_whitespace = False
8992 sanitize = False
@@ -92,13 +95,23 @@ class HTMLSerializer(object):
9295 "minimize_boolean_attributes" , "use_trailing_solidus" ,
9396 "space_before_trailing_solidus" , "omit_optional_tags" ,
9497 "strip_whitespace" , "inject_meta_charset" , "escape_lt_in_attrs" ,
95- "escape_rcdata" , "resolve_entities" , "sanitize" )
98+ "escape_rcdata" , "resolve_entities" , "emit_doctype" , " sanitize" )
9699
97100 def __init__ (self , ** kwargs ):
98101 """Initialize HTMLSerializer.
99102
100103 Keyword options (default given first unless specified) include:
101104
105+ emit_doctype='html'|'xhtml'|'html5'|'preserve'
106+ Whether to output a doctype.
107+ * emit_doctype='xhtml' preserves unknown doctypes and valid
108+ XHTML doctypes, converts valid HTML doctypes to their XHTML
109+ counterparts, and drops <!DOCTYPE html>
110+ * emit_doctype='html' preserves unknown doctypes and valid
111+ HTML doctypes, converts valid XHTML doctypes to their HTML
112+ counterparts, and uses <!DOCTYPE html> for missing doctypes
113+ * emit_doctype='html5' Uses <!DOCTYPE html> as the doctype
114+ * emit_doctype='preserve' preserves the doctype, if any, unchanged
102115 inject_meta_charset=True|False
103116 ..?
104117 quote_attr_values=True|False
@@ -140,6 +153,86 @@ def __init__(self, **kwargs):
140153 self .errors = []
141154 self .strict = False
142155
def calc_doctype(self, token=None):
    """Return the doctype declaration to emit, as a unicode string.

    The result depends on ``self.emit_doctype``:

    * ``'html5'``: always ``<!DOCTYPE html>``.
    * ``'html'``: ``<!DOCTYPE html>`` when there is no doctype token;
      known XHTML doctypes are rewritten to their HTML 4.01
      counterparts; anything else is kept as is.
    * ``'xhtml'``: known HTML 4.0/4.01/3.2 doctypes are rewritten to
      their XHTML 1.0 counterparts; anything else is kept as is.
    * ``'preserve'`` (or any other value): the token's doctype is
      rebuilt unchanged.

    ``token`` is a Doctype token, read through its ``"name"``,
    ``"publicId"`` and ``"systemId"`` keys, or None when the document
    had no doctype.  When a doctype is synthesised for a missing token
    the string ends with a newline.  When there is no token and the
    mode does not call for a synthesised doctype, an empty string is
    returned instead of failing on the missing token.

    ``self.serializeError`` is called when the system identifier
    contains both single and double quote characters.
    """
    mode = self.emit_doctype

    if mode == 'html5' or (not token and mode == 'html'):
        if token:
            return u'<!DOCTYPE html>'
        return u'<!DOCTYPE html>\n'

    if not token:
        # Nothing to preserve or convert: there was no doctype at all.
        return u''

    rootElement = token["name"]
    publicID = token["publicId"]
    systemID = token["systemId"]

    # Each rule: (accepted public ids, accepted system ids,
    #             replacement public id, replacement system id).
    # A rule applies when the public id is listed and the system id is
    # either absent or listed; the system id is only rewritten when
    # one was present.  Identifiers are compared exactly so that
    # unknown doctypes (including near misses) are left untouched.
    if mode == u'html':
        rules = (
            # XHTML 1.1
            ((u"-//W3C//DTD XHTML 1.1//EN",),
             (u"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Strict
            ((u"-//W3C//DTD XHTML 1.0 Strict//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",),
             u"-//W3C//DTD HTML 4.01//EN",
             u"http://www.w3.org/TR/html4/strict.dtd"),
            # XHTML 1.0 Transitional
            ((u"-//W3C//DTD XHTML 1.0 Transitional//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd",),
             u"-//W3C//DTD HTML 4.01 Transitional//EN",
             u"http://www.w3.org/TR/html4/loose.dtd"),
            # XHTML 1.0 Frameset
            ((u"-//W3C//DTD XHTML 1.0 Frameset//EN",),
             (u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd",),
             u"-//W3C//DTD HTML 4.01 Frameset//EN",
             u"http://www.w3.org/TR/html4/frameset.dtd"),
        )
    elif mode == u'xhtml':
        # NOTE(review): the option documentation says 'xhtml' drops
        # <!DOCTYPE html>; that is not implemented here (such a doctype
        # is passed through unchanged) -- confirm the intended behaviour.
        rules = (
            # HTML 4.0 / 4.01 Strict
            ((u"-//W3C//DTD HTML 4.0//EN",
              u"-//W3C//DTD HTML 4.01//EN"),
             (u"http://www.w3.org/TR/html4/strict.dtd",
              u"http://www.w3.org/TR/REC-html40/strict.dtd"),
             u"-//W3C//DTD XHTML 1.0 Strict//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
            # HTML 4.0 / 4.01 Transitional
            ((u"-//W3C//DTD HTML 4.0 Transitional//EN",
              u"-//W3C//DTD HTML 4.01 Transitional//EN"),
             (u"http://www.w3.org/TR/html4/loose.dtd",
              u"http://www.w3.org/TR/REC-html40/loose.dtd"),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"),
            # HTML 4.0 / 4.01 Frameset
            ((u"-//W3C//DTD HTML 4.0 Frameset//EN",
              u"-//W3C//DTD HTML 4.01 Frameset//EN"),
             (u"http://www.w3.org/TR/html4/frameset.dtd",
              u"http://www.w3.org/TR/REC-html40/frameset.dtd"),
             u"-//W3C//DTD XHTML 1.0 Frameset//EN",
             u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"),
            # HTML 3.2: only converted when no system id is given, so
            # the accepted-system-id list is empty and the replacement
            # system id is never used.
            ((u"-//W3C//DTD HTML 3.2//EN",
              u"-//W3C//DTD HTML 3.2 Final//EN"),
             (),
             u"-//W3C//DTD XHTML 1.0 Transitional//EN",
             None),
        )
    else:
        rules = ()

    # Conversions only apply to documents whose root element is exactly
    # "html" (case-insensitively); a missing name never matches.
    if rootElement and rootElement.lower() == u"html":
        for publics, systems, new_public, new_system in rules:
            if publicID in publics and (not systemID or systemID in systems):
                publicID = new_public
                if systemID:
                    systemID = new_system
                break

    doctype = u"<!DOCTYPE %s" % rootElement
    if publicID:
        doctype += u' PUBLIC "%s"' % publicID
    elif systemID:
        doctype += u" SYSTEM"
    if systemID:
        # Quote with whichever character the identifier does not use.
        if u'"' in systemID:
            if u"'" in systemID:
                self.serializeError(_("System identifer contains both single and double quote characters"))
            quote_char = u"'"
        else:
            quote_char = u'"'
        doctype += u" %s%s%s" % (quote_char, systemID, quote_char)
    doctype += u">"
    return doctype
235+
143236 def serialize (self , treewalker , encoding = None ):
144237 in_cdata = False
145238 self .errors = []
@@ -157,26 +250,12 @@ def serialize(self, treewalker, encoding=None):
157250 if self .omit_optional_tags :
158251 from html5lib .filters .optionaltags import Filter
159252 treewalker = Filter (treewalker )
253+ posted_doctype = False
160254 for token in treewalker :
161255 type = token ["type" ]
162256 if type == "Doctype" :
163- doctype = u"<!DOCTYPE %s" % token ["name" ]
164-
165- if token ["publicId" ]:
166- doctype += u' PUBLIC "%s"' % token ["publicId" ]
167- elif token ["systemId" ]:
168- doctype += u" SYSTEM"
169- if token ["systemId" ]:
170- if token ["systemId" ].find (u'"' ) >= 0 :
171- if token ["systemId" ].find (u"'" ) >= 0 :
172- self .serializeError (_ ("System identifer contains both single and double quote characters" ))
173- quote_char = u"'"
174- else :
175- quote_char = u'"'
176- doctype += u" %s%s%s" % (quote_char , token ["systemId" ], quote_char )
177-
178- doctype += u">"
179-
257+ posted_doctype = True
258+ doctype = self .calc_doctype (token )
180259 if encoding :
181260 yield doctype .encode (encoding )
182261 else :
@@ -196,6 +275,9 @@ def serialize(self, treewalker, encoding=None):
196275 yield escape (token ["data" ])
197276
198277 elif type in ("StartTag" , "EmptyTag" ):
278+ if not posted_doctype :
279+ posted_doctype = True
280+ yield self .calc_doctype ()
199281 name = token ["name" ]
200282 if name in rcdataElements and not self .escape_rcdata :
201283 in_cdata = True
0 commit comments