summaryrefslogtreecommitdiff
path: root/pipermail_get_archives_mbox
blob: 697510f814bc15145725e98c67dc8a2c3077567d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python

# This program is free software. It comes without any warranty, to the
# extent permitted by applicable law. You can redistribute it and/or
# modify it under the terms of the Do What The Fuck You Want To Public
# License, Version 2, as published by Sam Hocevar.
# See http://sam.zoy.org/wtfpl/COPYING for more details.

"""
Download pipermail archives from a given url, uncompress them if necessary,
do string substitutions so that they could be opened in a mail reader and
concatenate the result in a big mylist.mbox file.
"""

from __future__ import print_function
import sys
import os
import re
import urllib
import BeautifulSoup
import fileinput
import tempfile
import shutil
import logging


# Usage text handed to sys.exit() when no pipermail URL is given.
helpmsg = """Usage: %(name)s pipermail-url
Example: %(name)s http://lists.example.com/pipermail/mylist/""" % dict(name=sys.argv[0])

# The root logger accepts every level; the global disable threshold then
# suppresses everything strictly below INFO (i.e. DEBUG records).
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
logging.disable(logging.INFO - 1)
logger = logging.getLogger('pipermail_get_archives_mbox')

# The pipermail URL is the single required positional argument.
if not sys.argv[1:]:
    logger.error("invalid command line options")
    sys.exit(helpmsg)

url = sys.argv[1]
# url = re.sub(r'/$', '', sys.argv[1]) + '/'

# Characters accepted in a mailing list name
RE_LIST = '[a-zA-Z0-9-+._]+'

# URL shapes understood by this script:
#   re_all      .../pipermail/<list>/
#   re_message  .../pipermail/<list>/<year>-<Month>/<number>.html
#   re_month    .../pipermail/<list>/<year>-<Month>[/...]
re_all = r'^.*\/pipermail\/(%s)\/?$' % RE_LIST
re_message = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)\/([0-9]+)\.html$' % RE_LIST
re_month = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)(\/.*)?$' % RE_LIST

MODE = ""
archive = ""
month = ""

# Try the patterns in priority order (whole archive, single message,
# single month); the first one that matches decides the mode.
_url_patterns = (
    ("all", re_all),
    ("message", re_message),
    ("month", re_month),
)
for MODE, _pattern in _url_patterns:
    r = re.search(_pattern, url)
    if r:
        break
else:
    # Fail: the url has none of the supported shapes
    logger.error("Can't match url " + url)
    sys.exit(1)

# Derive the output file name (and the month, when the url names one)
if MODE == "all":
    archive = r.group(1) + ".mbox"
elif MODE == "message":
    month = r.group(2) + "-" + r.group(3)
    archive = r.group(1) + "_" + r.group(4) + ".mbox"
else:
    month = r.group(2) + "-" + r.group(3)
    archive = r.group(1) + "_" + month + ".mbox"

logger.info("Output file " + archive)

# Output mbox; opened in append+read mode because single-message mode
# reads back what has been written to it.
farch = open(archive, 'a+')

# Scratch directory for the downloaded monthly archives
dtemp = tempfile.mkdtemp()
# Open main archive url (the list index page, whatever the mode is)
main_url = re.search(r"^(.*\/pipermail\/(%s)\/?)" % RE_LIST, url).group(1)
logger.info("GET %s" % main_url)
try:
    furl = urllib.urlopen(main_url)
    try:
        data = furl.read()
    finally:
        # Release the HTTP connection as soon as the page has been read
        furl.close()
except IOError:
    # Do not leave the temporary directory or the open archive handle
    # behind when the index page cannot be fetched.
    farch.close()
    shutil.rmtree(dtemp)
    raise
soup = BeautifulSoup.BeautifulSoup(data)

# Number of messages written so far
messagenumber = 0


def get_month_url(month_url):
    """Download one monthly archive and append it, rewritten, to ``farch``.

    The file at ``main_url + month_url`` is saved into the temporary
    directory ``dtemp``, read line by line (transparently decompressed by
    ``fileinput.hook_compressed`` when it is a .gz or .bz2 file) and each
    line is written to the module-level ``farch`` after two rewrites:

    * on ``From``/``From:`` lines, the " at " / " en " address obfuscation
      is turned back into "@";
    * ``Date: Www Mmm D HH:MM:SS YYYY`` headers are reordered into
      ``Date: Www, D Mmm YYYY HH:MM:SS +0000``.

    month_url -- archive file name relative to ``main_url``.

    Returns the number of lines starting with "From " that were seen,
    i.e. the number of messages in this monthly archive.
    """
    logger.info("GET %s" % month_url)
    f = os.path.join(dtemp, month_url)
    urllib.urlretrieve(main_url + month_url, f)
    messagenumber = 0
    # The downloaded file is only read: use plain 'r' ('rw' is not a valid
    # open mode) and make sure the handle is closed once we are done.
    fmonth = fileinput.hook_compressed(f, 'r')
    try:
        for line in fmonth:
            # test if this line is the beginning of a new message
            if line.startswith('From '):
                messagenumber += 1
            # revert email addresses escaping
            tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line)
            # rewrite the pipermail date into a mail-style Date header
            tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp)
            farch.write(tmp2)
    finally:
        fmonth.close()
    return messagenumber


# Fetch what the url asked for.  The try/finally guarantees that the
# archive handle is closed and the temporary directory removed on every
# exit path, including sys.exit() and unexpected errors.
try:
    if MODE == "all":
        # Every monthly archive linked from the index page
        messagenumber = 0
        for a in soup.findAll('a', href=re.compile(r'^[0-9]+-[a-zA-Z]+\.txt')):
            # each call returns the count for one month only: accumulate
            messagenumber += get_month_url(a['href'])
        # close only after the last month, get_month_url writes to farch
        farch.close()
    else:
        month_link = soup.find('a', href=re.compile(r'%s.txt' % month))
        if month_link is None:
            logger.error("Could not find archive link for %s in %s" % (month, main_url))
            sys.exit(1)
        month_url = month_link['href']
        if MODE == "month":
            messagenumber = get_month_url(month_url)
            farch.close()
        elif MODE == "message":
            # get Message-ID (In-Reply-To)
            logger.info("INFO: GET %s" % url)
            murl = urllib.urlopen(url)
            try:
                mdata = murl.read()
            finally:
                murl.close()
            msoup = BeautifulSoup.BeautifulSoup(mdata)
            a = msoup.find('a', href=re.compile(r'.*In-Reply-To=', re.IGNORECASE))
            if not a:
                logger.error("Could not find message \"In-Reply-To=\" in %s" % url)
                sys.exit(1)
            # same case-insensitivity as the lookup above, so this cannot
            # fail on a link the lookup accepted
            r = re.search(r".*In-Reply-To=((?!;).*)", a['href'], re.IGNORECASE)
            messageid = urllib.unquote(r.group(1))
            logger.info("INFO: Message-ID: %s" % messageid)
            messagenumber = get_month_url(month_url)
            farch.seek(0)
            # Remove all messages except the one matching messageid.
            # The id is escaped: it routinely contains regex
            # metacharacters such as '.', '+' or '$'.
            re_messageid = re.compile(r"^Message-ID: %s" % re.escape(messageid), re.IGNORECASE)
            fcurrentemailpath = os.path.join(dtemp, "currentemail")
            matched = False
            with open(fcurrentemailpath, 'w+') as fcurrentemail:
                for line in farch:
                    # new email
                    if line.startswith("From "):
                        if matched:
                            # the wanted message is complete
                            break
                        # start over: the previous message was not the one
                        sys.stdout.write("INFO: Message %s" % line)
                        fcurrentemail.seek(0)
                        fcurrentemail.truncate()
                        fcurrentemail.write(line)
                    # matching messageid
                    elif re_messageid.match(line):
                        sys.stdout.write("INFO: Matched %s" % line)
                        fcurrentemail.write(line)
                        matched = True
                    # other line
                    else:
                        fcurrentemail.write(line)
            farch.close()
            if not matched:
                logger.warning("Message-ID %s not found in archive; output may not contain the requested message" % messageid)
            # shutil.move also works when the temporary directory and the
            # output file live on different filesystems (os.rename fails)
            shutil.move(fcurrentemailpath, archive)
            messagenumber = 1
finally:
    # closing an already closed file is a no-op
    farch.close()
    shutil.rmtree(dtemp)

logger.info("INFO: %s message(s) written." % messagenumber)