diff options
author | Félix Sipma <felix.sipma@no-log.org> | 2015-11-06 13:32:09 +0100 |
---|---|---|
committer | Félix Sipma <felix.sipma@no-log.org> | 2015-11-06 13:39:49 +0100 |
commit | 2a427826cf21e9c2f3b59b83114ad0350250f9a1 (patch) | |
tree | a1242687f9542fee5b5f241b8b4e20be6c4c8981 | |
parent | e5845d950ab845f764ded4607ec43af98e961c7a (diff) |
-rwxr-xr-x | pipermail_get_archives_mbox | 150 |
1 files changed, 125 insertions, 25 deletions
diff --git a/pipermail_get_archives_mbox b/pipermail_get_archives_mbox index 67e1b97..697510f 100755 --- a/pipermail_get_archives_mbox +++ b/pipermail_get_archives_mbox @@ -9,9 +9,10 @@ """ Download pipermail archives from a given url, uncompress them if necessary, do string substitutions so that they could be opened in a mail reader and -concatenate the result in a big mylistname_mbox.txt file. +concatenate the result in a big mylist.mbox file. """ +from __future__ import print_function import sys import os import re @@ -20,40 +21,139 @@ import BeautifulSoup import fileinput import tempfile import shutil +import logging + + +helpmsg = """Usage: %(name)s pipermail-url +Example: %(name)s http://lists.example.com/pipermail/mylist/""" % {'name': sys.argv[0]} + +logging.basicConfig() +logging.getLogger().setLevel(logging.DEBUG) +logging.disable(logging.INFO - 1) +logger = logging.getLogger('pipermail_get_archives_mbox') if len(sys.argv) < 2: - sys.exit('Usage: %(name)s pipermail-url\nExample: %(name)s http://lists.example.com/pipermail/mylist/' % {'name': sys.argv[0]}) + logger.error("invalid command line options") + sys.exit(helpmsg) -url = re.sub(r'/$', '', sys.argv[1]) + '/' +url = sys.argv[1] +# url = re.sub(r'/$', '', sys.argv[1]) + '/' -try: - archive = re.match(r'^.*\/pipermail\/([a-zA-Z0-9-+._]+)', url).group(1) + ".mbox" -except AttributeError: - archive = "pipermail.mbox" +# Used to match the mailing list name +RE_LIST = '[a-zA-Z0-9-+._]+' -print "INFO: Output file " + archive +re_all = r'^.*\/pipermail\/(%s)\/?$' % RE_LIST +re_message = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)\/([0-9]+)\.html$' % RE_LIST +re_month = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)(\/.*)?$' % RE_LIST -dtemp = tempfile.mkdtemp() +MODE = "" +archive = "" +month = "" + +# Whole archive mode +if re.match(re_all, url): + r = re.search(re_all, url) + archive = r.group(1) + ".mbox" + MODE = "all" +# Single message mode +elif re.match(re_message, url): + r = re.search(re_message, url) + archive = r.group(1) + "_" + r.group(4) + ".mbox" + month = r.group(2) + "-" + r.group(3) + MODE = "message" +# Single month mode +elif re.match(re_month, url): + r = re.search(re_month, url) + archive = r.group(1) + "_" + r.group(2) + "-" + r.group(3) + ".mbox" + month = r.group(2) + "-" + r.group(3) + MODE = "month" +# Fail +else: + logger.error("Can't match url " + url) + sys.exit(1) + +logger.info("Output file " + archive) -farch = open(archive, 'w') -furl = urllib.urlopen(url) +farch = open(archive, 'a+') + +dtemp = tempfile.mkdtemp() +# Open main archive url +main_url = re.search(r"^(.*\/pipermail\/(%s)\/?)" % RE_LIST, url).group(1) +logger.info("GET %s" % main_url) +furl = urllib.urlopen(main_url) data = furl.read() soup = BeautifulSoup.BeautifulSoup(data) -linenumber = 0 +messagenumber = 0 + + +def get_month_url(month_url): + logger.info("GET %s" % month_url) + f = os.path.join(dtemp, month_url) + urllib.urlretrieve(main_url + month_url, f) + messagenumber = 0 + for line in fileinput.hook_compressed(f, 'rw'): + # test if this line is the beginning of a new message + if re.match(r'^From ', line): + messagenumber += 1 + # revert email addresses escaping + tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line) + tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp) + farch.write(tmp2) + return messagenumber + -for a in soup.findAll('a', href=True): - if '.txt' in a['href']: - print "INFO: GET " + url + a['href'] - f = os.path.join(dtemp, a['href']) - urllib.urlretrieve(url + a['href'], f) - for line in fileinput.hook_compressed(f, 'rw'): - # revert email addresses escaping - tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line) - tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp) - farch.write(tmp2) - linenumber = linenumber + 1 +if MODE == "all": + for a in soup.findAll('a', href=re.compile(r'^[0-9]+-[a-zA-Z]+\.txt')): + messagenumber = get_month_url(a['href']) + farch.close() +else: + month_url = soup.find('a', href=re.compile(r'%s.txt' % month))['href'] + if MODE == "month": + messagenumber = get_month_url(month_url) + farch.close() + elif MODE == "message": + # get Message-ID (In-Reply-To) + logger.info("INFO: GET %s" % url) + murl = urllib.urlopen(url) + mdata = murl.read() + msoup = BeautifulSoup.BeautifulSoup(mdata) + a = msoup.find('a', href=re.compile(r'.*In-Reply-To=', re.IGNORECASE)) + if not a: + shutil.rmtree(dtemp) + logger.error("Could not find message \"In-Reply-To=\" in %s" % url) + sys.exit(1) + r = re.search(r".*In-Reply-To=((?!;).*)", a['href']) + messageid = urllib.unquote(r.group(1)) + logger.info("INFO: Message-ID: %s" % messageid) + messagenumber = get_month_url(month_url) + farch.seek(0) + # Remove all messages except the one matching messageid + fcurrentemailpath = os.path.join(dtemp, "currentemail") + fcurrentemail = open(fcurrentemailpath, 'w+') + matched = False + for line in farch: + # new email + if re.match(r"^From ", line): + if matched: + break + else: + sys.stdout.write("INFO: Message %s" % line) + fcurrentemail.seek(0) + fcurrentemail.truncate() + fcurrentemail.write(line) + # matching messageid + elif re.match(r"^Message-ID: %s" % messageid, line, re.IGNORECASE): + sys.stdout.write("INFO: Matched %s" % line) + fcurrentemail.write(line) + matched = True + # other line + else: + fcurrentemail.write(line) + farch.close() + fcurrentemail.close() + os.rename(fcurrentemailpath, archive) + messagenumber = 1 shutil.rmtree(dtemp) -farch.close() -print "INFO: %s lines written." % linenumber +logger.info("INFO: %s message(s) written." % messagenumber) |