about summary refs log tree commitdiff
diff options
context:
space:
mode:
authorFélix Sipma <felix.sipma@no-log.org>2018-08-17 12:06:08 +0200
committerFélix Sipma <felix.sipma@no-log.org>2018-08-17 12:33:09 +0200
commit5bea27575ab56bd9130c9b0a3503e9f8ebd592b2 (patch)
treef70b314e228509d334569a665eded64d9b1d6e55
parent056e0dc4ab30cfa2f14d2e3b1d6ba12c946d9482 (diff)
add ddt-sdrea.py filter
-rwxr-xr-xdoc/filters/ddt-sdrea.py146
1 files changed, 146 insertions, 0 deletions
diff --git a/doc/filters/ddt-sdrea.py b/doc/filters/ddt-sdrea.py
new file mode 100755
index 0000000..58959c6
--- /dev/null
+++ b/doc/filters/ddt-sdrea.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim: ft=python et softtabstop=4 cinoptions=4 shiftwidth=4 ts=4 ai
+"""
+USAGE
+ ddt-sdrea.py [OPTION] URL
+
+OPTIONS
+ -t, --title=TEXT
+ Updates feed title.
+ Default is page title
+ -d, --description=TEXT
+ Updates feed description.
+ Default is ""
+ -v, --version
+ Print version and exit.
+ -h, --help
+ Show this help and exit.
+
+EXAMPLES
+ ddt-sdrea.py -t "Publicité des demandes d'autorisation d'exploiter Loire" -d "Test" \
+"http://www.loire.gouv.fr/publicite-des-demandes-d-autorisation-d-exploiter-a6497.html"
+"""
+
+from __future__ import print_function
+from bs4 import BeautifulSoup
+import PyRSS2Gen
+import sys
+import getopt
+import logging
+from dateutil.parser import parse, parserinfo
+import datetime
+import requests
+import subprocess
+
+__version__ = "0.1"
+
+# Defaults
+feed_title = None
+feed_description = None
+months = [
+ ('Janvier', 'Jan', 'Janv', ),
+ ('Février', 'Fév', 'Fev', 'f\xe9vrier', 'f\xe9v', ),
+ ('Mars', 'Mar', ),
+ ('Avril', 'Avr', ),
+ ('Mai', ),
+ ('Juin', ),
+ ('Juillet', 'Juil', ),
+ ('Août', 'Aout', 'ao\xfbt', ),
+ ('Septembre', 'Sept', ),
+ ('Octobre', 'Oct', ),
+ ('Novembre', 'Nov', ),
+ ('Décembre', 'Déc', 'Dec', 'd\xe9cembre', 'd\xe9c', )
+]
+
+# Logging
+logging.basicConfig()
+logging.getLogger().setLevel(logging.DEBUG)
+logging.disable(logging.INFO - 1)
+logger = logging.getLogger('ddt-sdrea')
+
# Parse command-line options.
# NOTE(review): the short-option string also accepts '-u ARG', but no branch
# below handles it, so '-u' is silently swallowed -- kept so existing callers
# do not start failing; consider removing it.
try:
    optlist, args = getopt.getopt(sys.argv[1:], 'u:t:d:hv',
                                  ['help', 'version', 'title=',
                                   'description='])
except getopt.GetoptError as msg:
    # Unknown option / missing argument: report the error plus the usage text.
    logger.error("%s\n\n%s", msg, __doc__)
    sys.exit(2)

for opt, arg in optlist:
    # Fixed: the original used `if` for -d, restarting the elif chain;
    # behavior was accidentally correct but fragile.  Single chain now.
    if opt in ('-t', '--title'):
        feed_title = arg
    elif opt in ('-d', '--description'):
        feed_description = arg
    elif opt in ('-v', '--version'):
        print(__version__)
        sys.exit(0)
    elif opt in ('-h', '--help'):
        print(__doc__)
        sys.exit(0)
+
# The page HTML arrives on stdin.  NOTE: despite the usage text, the
# positional URL argument is never fetched here; all metadata is taken
# from the page itself.
html = sys.stdin.read()
soup = BeautifulSoup(html, "lxml")


def _head_meta(name):
    # Return the content attribute of the <meta name="..."> tag in <head>.
    return soup.head.find("meta", {"name": name}).attrs.get("content")


url = _head_meta("dcterms.identifier")
base_url = url.rsplit('/', 1)[0]  # directory part, used to resolve relative links
feed_date = _head_meta("dcterms.modified")
page_title = soup.head.find("title").text.strip()

# Fall back to the page <title> when no -t/--title was supplied.
feed_title = feed_title or page_title

# One table row per publication entry.
entries = soup.body.find("div", {"class": "texte"}).findAll('tr')
+
# Accumulator for the RSS items built from the scraped entries.
items = []

# NOTE(review): the original also computed today / one_day / yesterday here;
# none of those names were referenced anywhere else in the script, so the
# dead date arithmetic has been removed.
+
class MyParserInfo(parserinfo):
    # Extend dateutil's built-in (English) month table with the French
    # names and abbreviations defined above, so the fuzzy date parser
    # recognises French month spellings.
    MONTHS = [english + french
              for english, french in zip(parserinfo.MONTHS, months)]
+
# For each table row: extract title, date and PDF link, download the PDF,
# convert it to text, and append an RSS item.  Lookup failures are logged
# but the loop still appends the (partial) item -- nothing is skipped.
for entry in entries:
    # Assumes each row holds <th>date</th> and <td>title ... <a href=pdf></td>
    # -- TODO confirm against the live page markup.
    title = entry.find("td").text.strip()
    if not title:
        logger.error("Can't find title for entry.")
    date = parse(entry.find("th").text.strip(), dayfirst=True, fuzzy=True, parserinfo=MyParserInfo())
    # NOTE(review): dateutil's parse() raises ValueError on unparseable input
    # instead of returning a falsy value, so this guard appears to be dead.
    if not date:
        logger.error("Can't find date for entry %s.", title)
    link = entry.find("a").attrs.get("href")
    if not link:
        logger.error("Can't find link for entry %s.", title)
    # Fetch the linked PDF (href presumably relative to the page directory --
    # verify) and pipe it through pdftotext (stdin '-' to stdout '-').
    pdf = requests.get(base_url + "/" + link)
    p = subprocess.Popen(['pdftotext', '-q', '-layout', '-', '-'],
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    # NOTE(review): communicate() returns bytes on Python 3, so this
    # str + bytes concatenation only works under Python 2 -- confirm the
    # target interpreter before porting.
    content = "<pre>" + p.communicate(input=pdf.content)[0] + "</pre>"
    items.append(
        PyRSS2Gen.RSSItem(
            title=title,
            #link=link,
            #categories=[category],
            description=content,
            pubDate=date,
        )
    )
+
# Assemble the channel and emit the finished feed as UTF-8 XML on stdout.
build_date = parse(feed_date)
rss = PyRSS2Gen.RSS2(
    title=feed_title,
    link=url,
    description=feed_description or "",
    lastBuildDate=build_date,
    items=items,
)
print(rss.to_xml(encoding='utf-8'))