"""Functions to run cherrypy.response through Tidy or NSGMLS."""
2
3 import cgi
4 import os
5 import StringIO
6 import traceback
7
8 import cherrypy
9
def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    If either 'indent' or 'wrap' are specified, then response.body will be
    set to the output of tidy. Otherwise, only errors (including warnings,
    if warnings is True) will change the body.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)

    temp_dir: directory holding the work files 'page.html', 'tidy.out'
        and 'tidy.err'.
    tidy_path: path of the tidy executable.
    strict_xml: if true, '-xml' is passed to tidy and, when tidy reports
        nothing, the body is additionally parsed with ElementTree.
    errors_to_ignore: optional list of substrings; a report line that
        contains any of them is dropped.
    indent: if true, '-indent' is passed to tidy.
    wrap: False for no wrap option; otherwise a column number passed as
        '-wrap N'. Values that cannot be converted to int are ignored.
    warnings: if true, report lines containing 'Warning' are kept as well
        as those containing 'Error'.

    Responses whose Content-Type is not text/html are left untouched.
    """
    response = cherrypy.response

    # Tidy works on a file, so operate on the collapsed body as a whole.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0].strip()
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        # Keep only the charset value: anything after a further ';' is
        # another header parameter and must not leak into the command line.
        encoding = fct[i + 8:].split(';')[0].strip()

    if ct != 'text/html':
        return

    page_file = os.path.join(temp_dir, 'page.html')
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')

    with open(page_file, 'wb') as f:
        f.write(orig_body)

    # Tidy spells encodings without the dash (e.g. '-utf8').
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc

    xml_opt = ' -xml' if strict_xml else ''
    indent_opt = ' -indent' if indent else ''

    wrap_opt = ''
    if wrap is not False:
        try:
            wrap_opt = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError, OverflowError):
            # Not usable as a column number: run tidy without wrapping.
            wrap_opt = ''

    # NOTE: the arguments are interpolated into a shell command line, so
    # they must come from trusted configuration.
    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_opt, indent_opt, wrap_opt,
                        err_file, out_file, page_file))

    # Tidy's output only replaces the body when formatting was requested
    # and the command exited with status 0.
    use_output = bool(indent_opt or wrap_opt) and not result
    output = None
    if use_output:
        with open(out_file, 'rb') as f:
            output = f.read()

    with open(err_file, 'rb') as f:
        report = f.read()

    ignored = errors_to_ignore or []
    new_errs = []
    for err in report.splitlines():
        if 'Error' in err or (warnings and 'Warning' in err):
            if not any(err_ign in err for err_ign in ignored):
                new_errs.append(err)

    new_body = None
    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with literal tags.
        new_body = wrong_content('\n'.join(new_errs), orig_body)
    elif xml_opt:
        # The HTML is acceptable to tidy; check that it also parses as XML.
        try:
            from xml.etree.ElementTree import parse
        except ImportError:
            from elementtree.ElementTree import parse

        # Replace these entity references with plain text before parsing
        # so the XML parser does not trip over them.
        for tag in ('nbsp', 'quot'):
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        f = StringIO.StringIO(orig_body)
        try:
            parse(f)
        except Exception:
            # Not well-formed XML: report the parser traceback.
            new_body = wrong_content(traceback.format_exc(), orig_body,
                                     "XML")

    if new_body is None and use_output:
        new_body = [output]

    if new_body is not None:
        response.body = new_body
        # The body length changed, so the old Content-Length is stale.
        if "Content-Length" in response.headers:
            del response.headers["Content-Length"]
117
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    text: the string to escape. '&', '<' and '>' are replaced by their
        HTML entities; quote characters are left as they are.

    Returns the escaped string with every tab expanded to four spaces and
    every space replaced by '&nbsp;'.
    """
    try:
        from html import escape
    except ImportError:
        # Python 2 has no 'html' module; cgi.escape is the equivalent.
        from cgi import escape
    # Expand tabs first so the resulting spaces are converted as well.
    return (escape(text, quote=False)
            .replace('\t', '    ')
            .replace(' ', '&nbsp;'))
121
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    text: the string to escape. '&', '<' and '>' are replaced by their
        HTML entities; quote characters are left as they are.

    Returns the escaped string with every newline replaced by '<br />'.
    """
    try:
        from html import escape
    except ImportError:
        # Python 2 has no 'html' module; cgi.escape is the equivalent.
        from cgi import escape
    return escape(text, quote=False).replace('\n', '<br />')
125
def wrong_content(header, body, content_type="HTML"):
    """Build the HTML fragment shown in place of an invalid page.

    The fragment starts with a "Wrong <content_type>:" heading followed by
    header (passed through html_break), then lists every line of body
    (passed through html_space) prefixed with its 1-based, zero-padded
    line number. All parts are joined with '<br />'.
    """
    heading = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = ["%03d - %s" % (index + 1, html_space(source_line))
                for index, source_line in enumerate(body.splitlines())]
    return "<br />".join([heading] + numbered)
131
132
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    If nsgmls reports anything that is not filtered out, response.body is
    replaced by a report listing the messages and the numbered page source.

    temp_dir: directory holding the work files 'page.html' and
        'nsgmls.err'.
    nsgmls_path: path of the nsgmls executable.
    catalog_path: catalog passed to nsgmls with '-c'.
    errors_to_ignore: optional list of substrings; a report line that
        contains any of them is dropped.

    Responses whose Content-Type is not text/html are left untouched.
    """
    response = cherrypy.response

    # nsgmls works on a file, so operate on the collapsed body as a whole.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0].strip()
    if ct != 'text/html':
        return

    # Cut out every <script ...>...</script> section before validating
    # (presumably so markup-like text inside scripts is not reported).
    # An unterminated <script is left in place.
    while True:
        i = orig_body.find('<script')
        if i == -1:
            break
        j = orig_body.find('</script>', i)
        if j == -1:
            break
        orig_body = orig_body[:i] + orig_body[j + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    with open(page_file, 'wb') as f:
        f.write(orig_body)

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    # NOTE: the arguments are interpolated into a shell command line, so
    # they must come from trusted configuration.
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # Use forward slashes throughout the command line.
    command = command.replace('\\', '/')
    os.system(command)

    with open(err_file, 'rb') as f:
        errs = f.read()

    ignored = errors_to_ignore or []
    new_errs = [err for err in errs.splitlines()
                if not any(err_ign in err for err_ign in ignored)]

    if new_errs:
        # wrong_content() escapes the header and turns newlines into
        # <br />, so join with newlines rather than with literal tags.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        # The body length changed, so the old Content-Length is stale.
        if "Content-Length" in response.headers:
            del response.headers["Content-Length"]
184