Skip to content

Commit 6703cb5

Browse files
daxtens authored and stephenfin committed
parser: parse headers containing invalid characters or codings
If there is a non-ascii character in a header, parsing fails, even on Py27. This has huge Py2/Py3 complexities. The Py3 email package has tools to handle this - we just need to use them. Py2, on the other hand, needs a lot of hand-holding, as explained in the comments. Additionally, support headers that claim an encoding, but fail to decode with that encoding. This is handy for mails with malformed headers containing weird bytes. Signed-off-by: Daniel Axtens <[email protected]> Reported-by: Thomas Monjalon <[email protected]> Reviewed-by: Stephen Finucane <[email protected]> (cherry picked from commit 6625d4b)
1 parent e073ca7 commit 6703cb5

File tree

1 file changed

+82
-17
lines changed

1 file changed

+82
-17
lines changed

patchwork/bin/parsemail.py

Lines changed: 82 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -25,12 +25,10 @@
2525
import codecs
2626
import datetime
2727
from email import message_from_file
28-
from email.header import Header, decode_header
28+
from email.header import decode_header, make_header
2929
from email.utils import parsedate_tz, mktime_tz
3030
from fnmatch import fnmatch
31-
from functools import reduce
3231
import logging
33-
import operator
3432
import re
3533
import sys
3634

@@ -39,7 +37,6 @@
3937
from django.contrib.auth.models import User
4038
from django.utils.log import AdminEmailHandler
4139
from django.utils import six
42-
from django.utils.six.moves import map
4340

4441
from patchwork.models import (Patch, Project, Person, Comment, State,
4542
DelegationRule, get_default_initial_patch_state)
@@ -63,19 +60,84 @@ def normalise_space(str):
6360
return whitespace_re.sub(' ', str).strip()
6461

6562

63+
def sanitise_header(header_contents, header_name=None):
    """Clean an individual mail header.

    Given a header with header_contents, optionally labelled
    header_name, decode it with decode_header, sanitise it to make
    sure it decodes correctly and contains no invalid characters,
    then encode the result with make_header().

    Returns the Header object produced by make_header(). Exceptions
    raised by decode_header() itself are not handled here and
    propagate to the caller.
    """

    # We have some Py2/Py3 issues here.
    #
    # Firstly, the email parser (before we get here)
    #  Python 3: headers with weird chars are email.header.Header
    #            class, others as str
    #  Python 2: every header is an str
    #
    # Secondly, the behaviour of decode_header:
    #  Python 3: weird headers are labelled as unknown-8bit
    #  Python 2: weird headers are not labelled differently
    #
    # Lastly, making matters worse, in Python2, unknown-8bit doesn't
    # seem to be supported as an input to make_header, so not only do
    # we have to detect dodgy headers, we have to fix them ourselves.
    #
    # We solve this by catching any decoding errors, and then manually
    # handling any interesting headers.

    value = decode_header(header_contents)
    try:
        header = make_header(value,
                             header_name=header_name,
                             continuation_ws='\t')
    except (UnicodeDecodeError, LookupError):
        # At least one of the parts cannot be decoded: either its bytes
        # are not valid in the claimed coding (UnicodeDecodeError), or
        # the coding hint names a codec that Python does not have
        # (LookupError). Find out which one and fix it somehow.
        #
        # We get here under Py2 when there's non-7-bit chars in header,
        # or under Py2 or Py3 where decoding with the coding hint fails.

        new_value = []

        for (part, _coding) in value:
            # We have random bytes that aren't properly coded.
            # If we had a coding hint, it failed to help, so the hint
            # is deliberately ignored from here on.
            if six.PY3:
                # python3 - force coding to unknown-8bit
                new_value.append((part, 'unknown-8bit'))
            else:
                # python2 - no support in make_header for unknown-8bit
                # We should do unknown-8bit coding ourselves.
                # For now, we're just going to replace any dubious
                # chars with ?.
                #
                # TODO: replace it with a proper QP unknown-8bit codec.
                new_value.append((part.decode('ascii', errors='replace')
                                  .encode('ascii', errors='replace'),
                                  None))

        header = make_header(new_value,
                             header_name=header_name,
                             continuation_ws='\t')

    return header
126+
127+
66128
def clean_header(header):
67129
"""Decode a (possibly non-ascii) header into a space-normalised string."""
68-
def decode(fragment):
69-
(frag_str, frag_encoding) = fragment
70-
if frag_encoding:
71-
return frag_str.decode(frag_encoding)
72-
elif isinstance(frag_str, six.binary_type): # python 2
73-
return frag_str.decode()
74-
return frag_str
75130

76-
fragments = list(map(decode, decode_header(header)))
131+
sane_header = sanitise_header(header)
77132

78-
return normalise_space(u' '.join(fragments))
133+
# The sanitised header is stringified with unicode() on Py2, str() on Py3;
134+
# either call gives us the decoded, un-wrapped header.
135+
if six.PY2:
136+
header_str = unicode(sane_header)
137+
else:
138+
header_str = str(sane_header)
139+
140+
return normalise_space(header_str)
79141

80142

81143
def find_project_by_id(list_id):
@@ -168,10 +230,13 @@ def mail_date(mail):
168230

169231

170232
def mail_headers(mail):
"""Return the headers of mail as newline-joined 'name: value' lines.

Each value is passed through sanitise_header() and then encode()d.
"""
171-
return reduce(operator.__concat__,
172-
['%s: %s\n' % (k, Header(v, header_name=k,
173-
continuation_ws='\t').encode())
174-
for (k, v) in list(mail.items())])
233+
headers = [(key, sanitise_header(value, header_name=key))
234+
for key, value in mail.items()]
235+
236+
strings = [('%s: %s' % (key, header.encode()))
237+
for (key, header) in headers]
238+
239+
return '\n'.join(strings)
175240

176241

177242
def find_pull_request(content):

0 commit comments

Comments
 (0)