summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Félix Sipma <gueux@gueux.org> 2012-07-21 17:25:16 +0200
committer Félix Sipma <gueux@gueux.org> 2012-07-21 17:25:16 +0200
commit eecbb3ebc89f574211378f1c6811ce2f8b75013f (patch)
tree efe46e55147207048bbe1b4c95966253b101422c
Initial commit.
-rwxr-xr-x pipermail_get_archives.py 59
1 files changed, 59 insertions, 0 deletions
diff --git a/pipermail_get_archives.py b/pipermail_get_archives.py
new file mode 100755
index 0000000..40e9da6
--- /dev/null
+++ b/pipermail_get_archives.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+# This program is free software. It comes without any warranty, to the
+# extent permitted by applicable law. You can redistribute it and/or
+# modify it under the terms of the Do What The Fuck You Want To Public
+# License, Version 2, as published by Sam Hocevar.
+# See http://sam.zoy.org/wtfpl/COPYING for more details.
+
+"""
+Download pipermail archives from a given URL, uncompress them if necessary,
+apply string substitutions so that they can be opened in a mail reader, and
+concatenate the results into one big mylistname_mbox.txt file.
+"""
+
+import sys
+import os
+import re
+import urllib
+import BeautifulSoup
+import gzip
+import fileinput
+import tempfile
+import shutil
+
+if len(sys.argv) < 2:
+    sys.exit('Usage: %(name)s pipermail-url\nExample: %(name)s http://lists.example.com/pipermail/mylist/' % { 'name': sys.argv[0] })
+# Normalise the index URL so that it always ends with a slash.
+url = re.sub(r'/$', '', sys.argv[1]) + '/'
+try:
+    # Name the output after the last URL path component when it is a plain word.
+    archive = re.match(r'^.*/([\w]+)(/)?$', url).group(1) + "_mbox.txt"
+except AttributeError:
+    archive = "pipermail_mbox.txt"
+print "INFO: Output file " + archive
+
+# Fetch the index before creating anything on disk, so a bad URL leaves no debris.
+furl = urllib.urlopen(url)
+soup = BeautifulSoup.BeautifulSoup(furl.read())
+furl.close()
+linenumber = 0
+dtemp = tempfile.mkdtemp()
+farch = open(archive, 'w')
+try:
+    for a in soup.findAll('a'):
+        href = a.get('href') or ''  # anchors without an href are skipped
+        if '.txt' not in href:
+            continue
+        print "INFO: GET " + url + href
+        # basename() keeps the download inside dtemp whatever the href contains.
+        f = os.path.join(dtemp, os.path.basename(href))
+        urllib.urlretrieve(url + href, f)
+        for line in fileinput.hook_compressed(f, 'r'):
+            tmp = re.sub(r'^(From:? .*) (at|en) ', r'\1@', line)  # "user at host" -> "user@host"
+            farch.write(re.sub(r'^Date: ([A-Z][a-z][a-z]) +([A-Z][a-z][a-z]) +([0-9]+) +([0-9:]+) +([0-9]+)', r'Date: \1, \3 \2 \5 \4 +0000', tmp))
+            linenumber = linenumber + 1
+finally:  # always release the output file and the scratch directory
+    farch.close()
+    shutil.rmtree(dtemp)
+print "INFO: %s lines written." % linenumber