summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x pipermail_get_archives_mbox 150
1 files changed, 125 insertions, 25 deletions
diff --git a/pipermail_get_archives_mbox b/pipermail_get_archives_mbox
index 67e1b97..697510f 100755
--- a/pipermail_get_archives_mbox
+++ b/pipermail_get_archives_mbox
@@ -9,9 +9,10 @@
"""
Download pipermail archives from a given url, uncompress them if necessary,
do string substitutions so that they could be opened in a mail reader and
-concatenate the result in a big mylistname_mbox.txt file.
+concatenate the result in a big mylist.mbox file.
"""
+from __future__ import print_function
import sys
import os
import re
@@ -20,40 +21,139 @@ import BeautifulSoup
import fileinput
import tempfile
import shutil
+import logging
+
+
+helpmsg = """Usage: %(name)s pipermail-url
+Example: %(name)s http://lists.example.com/pipermail/mylist/""" % {'name': sys.argv[0]}
+
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+logging.disable(logging.INFO - 1)
+logger = logging.getLogger('pipermail_get_archives_mbox')
if len(sys.argv) < 2:
- sys.exit('Usage: %(name)s pipermail-url\nExample: %(name)s http://lists.example.com/pipermail/mylist/' % {'name': sys.argv[0]})
+ logger.error("invalid command line options")
+ sys.exit(helpmsg)
-url = re.sub(r'/$', '', sys.argv[1]) + '/'
+url = sys.argv[1]
+# url = re.sub(r'/$', '', sys.argv[1]) + '/'
-try:
- archive = re.match(r'^.*\/pipermail\/([a-zA-Z0-9-+._]+)', url).group(1) + ".mbox"
-except AttributeError:
- archive = "pipermail.mbox"
+# Used to match the mailing list name
+RE_LIST = '[a-zA-Z0-9-+._]+'
-print "INFO: Output file " + archive
+re_all = r'^.*\/pipermail\/(%s)\/?$' % RE_LIST
+re_message = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)\/([0-9]+)\.html$' % RE_LIST
+re_month = r'^.*\/pipermail\/(%s)\/([0-9]+)-([a-zA-Z]+)(\/.*)?$' % RE_LIST
-dtemp = tempfile.mkdtemp()
+MODE = ""
+archive = ""
+month = ""
+
+# Whole archive mode
+if re.match(re_all, url):
+ r = re.search(re_all, url)
+ archive = r.group(1) + ".mbox"
+ MODE = "all"
+# Single message mode
+elif re.match(re_message, url):
+ r = re.search(re_message, url)
+ archive = r.group(1) + "_" + r.group(4) + ".mbox"
+ month = r.group(2) + "-" + r.group(3)
+ MODE = "message"
+# Single month mode
+elif re.match(re_month, url):
+ r = re.search(re_month, url)
+ archive = r.group(1) + "_" + r.group(2) + "-" + r.group(3) + ".mbox"
+ month = r.group(2) + "-" + r.group(3)
+ MODE = "month"
+# Fail
+else:
+ logger.error("Can't match url " + url)
+ sys.exit(1)
+
+logger.info("Output file " + archive)
-farch = open(archive, 'w')
-furl = urllib.urlopen(url)
+farch = open(archive, 'a+')
+
+dtemp = tempfile.mkdtemp()
+# Open main archive url
+main_url = re.search(r"^(.*\/pipermail\/(%s)\/?)" % RE_LIST, url).group(1)
+logger.info("GET %s" % main_url)
+furl = urllib.urlopen(main_url)
data = furl.read()
soup = BeautifulSoup.BeautifulSoup(data)
-linenumber = 0
+messagenumber = 0
+
+
+def get_month_url(month_url):
+ logger.info("GET %s" % month_url)
+ f = os.path.join(dtemp, month_url)
+ urllib.urlretrieve(main_url + month_url, f)
+ messagenumber = 0
+ for line in fileinput.hook_compressed(f, 'rw'):
+ # test if this line is the beginning of a new message
+ if re.match(r'^From ', line):
+ messagenumber += 1
+ # revert email addresses escaping
+ tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line)
+ tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp)
+ farch.write(tmp2)
+ return messagenumber
+
-for a in soup.findAll('a', href=True):
- if '.txt' in a['href']:
- print "INFO: GET " + url + a['href']
- f = os.path.join(dtemp, a['href'])
- urllib.urlretrieve(url + a['href'], f)
- for line in fileinput.hook_compressed(f, 'rw'):
- # revert email addresses escaping
- tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line)
- tmp2 = re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp)
- farch.write(tmp2)
- linenumber = linenumber + 1
+if MODE == "all":
+ for a in soup.findAll('a', href=re.compile(r'^[0-9]+-[a-zA-Z]+\.txt')):
+ messagenumber = get_month_url(a['href'])
+ farch.close()
+else:
+ month_url = soup.find('a', href=re.compile(r'%s.txt' % month))['href']
+ if MODE == "month":
+ messagenumber = get_month_url(month_url)
+ farch.close()
+ elif MODE == "message":
+ # get Message-ID (In-Reply-To)
+ logger.info("INFO: GET %s" % url)
+ murl = urllib.urlopen(url)
+ mdata = murl.read()
+ msoup = BeautifulSoup.BeautifulSoup(mdata)
+ a = msoup.find('a', href=re.compile(r'.*In-Reply-To=', re.IGNORECASE))
+ if not a:
+ shutil.rmtree(dtemp)
+ logger.error("Could not find message \"In-Reply-To=\" in %s" % url)
+ sys.exit(1)
+ r = re.search(r".*In-Reply-To=((?!;).*)", a['href'])
+ messageid = urllib.unquote(r.group(1))
+ logger.info("INFO: Message-ID: %s" % messageid)
+ messagenumber = get_month_url(month_url)
+ farch.seek(0)
+ # Remove all messages except the one matching messageid
+ fcurrentemailpath = os.path.join(dtemp, "currentemail")
+ fcurrentemail = open(fcurrentemailpath, 'w+')
+ matched = False
+ for line in farch:
+ # new email
+ if re.match(r"^From ", line):
+ if matched:
+ break
+ else:
+ sys.stdout.write("INFO: Message %s" % line)
+ fcurrentemail.seek(0)
+ fcurrentemail.truncate()
+ fcurrentemail.write(line)
+ # matching messageid
+ elif re.match(r"^Message-ID: %s" % messageid, line, re.IGNORECASE):
+ sys.stdout.write("INFO: Matched %s" % line)
+ fcurrentemail.write(line)
+ matched = True
+ # other line
+ else:
+ fcurrentemail.write(line)
+ farch.close()
+ fcurrentemail.close()
+ os.rename(fcurrentemailpath, archive)
+ messagenumber = 1
shutil.rmtree(dtemp)
-farch.close()
-print "INFO: %s lines written." % linenumber
+logger.info("INFO: %s message(s) written." % messagenumber)