summaryrefslogtreecommitdiff
path: root/pipermail_get_archives_mbox
blob: 67e1b9730ca01f752d02bac907802e376281b25b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python

# This program is free software. It comes without any warranty, to the
# extent permitted by applicable law. You can redistribute it and/or
# modify it under the terms of the Do What The Fuck You Want To Public
# License, Version 2, as published by Sam Hocevar.
# See http://sam.zoy.org/wtfpl/COPYING for more details.

"""
Download pipermail archives from a given URL, uncompress them if necessary,
do string substitutions so that they can be opened in a mail reader and
concatenate the result into one big <listname>.mbox file (pipermail.mbox
when the list name cannot be derived from the URL).
"""

import sys
import os
import re
import urllib
import BeautifulSoup
import fileinput
import tempfile
import shutil

# A pipermail index URL is the single required argument; without it, exit
# with a usage message (sys.exit prints a string argument to stderr).
if not sys.argv[1:]:
    usage = 'Usage: %(name)s pipermail-url\nExample: %(name)s http://lists.example.com/pipermail/mylist/'
    sys.exit(usage % {'name': sys.argv[0]})

# Strip one trailing slash (if any) and add one back, so the archive links
# found on the index page can simply be appended to the base URL.
url = re.sub(r'/$', '', sys.argv[1]) + '/'

# Name the output file after the path component that follows "/pipermail/";
# fall back to a generic name when the URL does not have that shape.
list_match = re.match(r'^.*\/pipermail\/([a-zA-Z0-9-+._]+)', url)
if list_match is None:
    archive = "pipermail.mbox"
else:
    archive = list_match.group(1) + ".mbox"

print("INFO: Output file " + archive)

# Download every archive linked from the index page into a scratch
# directory, rewrite each line so the result reads as a regular mbox, and
# append everything to the output file named by `archive`.

# Patterns are compiled once here instead of being looked up for every line.
#
# Pipermail writes addresses as "user at example.com" ("en" is presumably the
# localised spelling on some lists).  The match is non-greedy so that only the
# first " at " after "From"/"From:" is turned into "@"; a greedy match would
# instead rewrite the *last* " at " on the line, e.g. one inside the
# parenthesised display name, and leave the address itself untouched.
from_re = re.compile(r'^(From:? .*?) (at|en) ')
# Reorder "Date: Mon Jan  2 15:04:05 2006" into "Date: Mon, 2 Jan 2006
# 15:04:05" and append a fixed "+0000" offset.
date_re = re.compile(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)')

dtemp = tempfile.mkdtemp()
linenumber = 0

try:
    # Fetch and parse the index page first, so that a failed download does
    # not truncate an already existing output file.
    furl = urllib.urlopen(url)
    try:
        data = furl.read()
    finally:
        furl.close()
    soup = BeautifulSoup.BeautifulSoup(data)

    farch = open(archive, 'w')
    try:
        for a in soup.findAll('a', href=True):
            href = a['href']
            if '.txt' not in href:
                continue

            print("INFO: GET " + url + href)
            # Keep only the final path component: the href comes from a
            # remote page and must not be able to place the download
            # outside the scratch directory.  The file extension, which
            # hook_compressed uses to pick a decompressor, is preserved.
            f = os.path.join(dtemp, os.path.basename(href))
            urllib.urlretrieve(url + href, f)

            # Open read-only: 'rw' is not a valid mode for the underlying
            # open()/gzip/bz2 calls.
            fin = fileinput.hook_compressed(f, 'r')
            try:
                for line in fin:
                    # revert email addresses escaping
                    tmp = from_re.sub(r'\1@', line)
                    tmp2 = date_re.sub(r'Date: \1, \3 \2 \5 \4 +0000', tmp)
                    farch.write(tmp2)
                    linenumber = linenumber + 1
            finally:
                fin.close()
    finally:
        farch.close()
finally:
    # Always remove the scratch directory, even when a download or a write
    # failed part-way through.
    shutil.rmtree(dtemp)

print("INFO: %s lines written." % linenumber)