summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Félix Sipma <felix.sipma@no-log.org> 2015-11-06 09:33:57 +0100
committer: Félix Sipma <felix.sipma@no-log.org> 2015-11-06 09:33:57 +0100
commit: 2f8fe4cb67ad6fcb61146cee0b20cdcf21fe7f79 (patch)
tree: c9624842018aa38a25631e6c6a1588d8622c144c
parent: 99f5a5551b1c9cb3ba7fee80a8f10a2dd299f8c7 (diff)
match only a's with href
-rwxr-xr-x pipermail_get_archives_mbox.py 4
1 files changed, 2 insertions, 2 deletions
diff --git a/pipermail_get_archives_mbox.py b/pipermail_get_archives_mbox.py
index cf31f9e..67e1b97 100755
--- a/pipermail_get_archives_mbox.py
+++ b/pipermail_get_archives_mbox.py
@@ -17,7 +17,6 @@ import os
import re
import urllib
import BeautifulSoup
-import gzip
import fileinput
import tempfile
import shutil
@@ -43,12 +42,13 @@ soup = BeautifulSoup.BeautifulSoup(data)
linenumber = 0
-for a in soup.findAll('a'):
+for a in soup.findAll('a', href=True):
if '.txt' in a['href']:
print "INFO: GET " + url + a['href']
f = os.path.join(dtemp, a['href'])
urllib.urlretrieve(url + a['href'], f)
for line in fileinput.hook_compressed(f, 'rw'):
+ # revert email addresses escaping
tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line)
tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp)
farch.write(tmp2)