aboutsummaryrefslogtreecommitdiff
path: root/doc/filters/recupe.py
blob: 403b5c1f1afbe36dcf7f71a6f02f0a48417882f9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ft=python et softtabstop=4 cinoptions=4 shiftwidth=4 ts=4 ai
"""
USAGE
    curl URL | recupe.py [OPTION]

OPTIONS
    -t, --title=TEXT
        Updates feed title.
        Default is page title
    -d, --description=TEXT
        Updates feed description.
        Default is ""
    -v, --version
        Print version and exit.
    -h, --help
        Show this help and exit.
"""

from __future__ import print_function
from bs4 import BeautifulSoup
import PyRSS2Gen
import sys
import getopt
import logging
import re
import datetime

__version__ = "0.2"

# Defaults — both overridable via the -t / -d command-line options below.
feed_title = None
feed_description = None
# Site the scraped page comes from; used to absolutize relative entry links.
base_url = "http://recupe.net/"

# Logging: root logger accepts DEBUG, but disable() suppresses every record
# strictly below INFO — net effect: INFO and above are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.DEBUG)
logging.disable(logging.INFO - 1)
logger = logging.getLogger('recupe')

# Parse command-line options.
# NOTE(review): the optstring also declares 'u:' but no branch below consumes
# it, so '-u VALUE' is accepted and silently ignored — confirm whether it can
# be dropped without breaking existing callers.
try:
    optlist, args = getopt.getopt(
        sys.argv[1:],
        'u:t:d:hv',
        ['help', 'version', 'title=', 'description=']
    )
except getopt.GetoptError as msg:
    logger.error("%s\n\n%s", msg, __doc__)
    sys.exit(2)

for opt, val in optlist:
    if opt in ('-t', '--title'):
        feed_title = val
    # Fixed: this branch was a bare 'if', breaking the elif chain used by the
    # other options (harmless today, but fragile if options are added).
    elif opt in ('-d', '--description'):
        feed_description = val
    elif opt in ('-v', '--version'):
        print(__version__)
        sys.exit(0)
    elif opt in ('-h', '--help'):
        print(__doc__)
        sys.exit(0)

# Read the raw HTML page from stdin (as bytes) and parse it.
html = sys.stdin.buffer.read()
soup = BeautifulSoup(html, "lxml")

page_title = soup.head.title.text

# When -t was not given, derive the feed title from the page title.
if not feed_title:
    feed_title = "Recupe.net - " + page_title

# Classified-ad rows live inside the #contenu table, one <tr> per entry,
# alternating between the two row classes.
content_div = soup.body.find("div", id="contenu")
entries = content_div.table.find_all(
    "tr",
    {"class": ["ligne_normal", "ligne_bleu"]}
)

# Build RSS items from the scraped table rows.
items = []

# Dates appear as dd/mm/yyyy; compile the pattern once, not per row.
date_re = re.compile(r"(\d+)/(\d+)/(\d+)")

for entry in entries:
    city = entry.find("td", {"class": "ville"}).a.text
    tds = entry.find_all("td")
    title = tds[3].text
    link = base_url + tds[3].a.attrs.get("href")
    rawdate = tds[1].text
    match = date_re.search(rawdate)
    if not match:
        # Fixed: the original fell through with 'redate' still bound to the
        # compiled pattern, so pubDate construction crashed with a TypeError.
        # Skip the row (with a warning) instead of aborting the whole feed.
        logger.warning("skipping entry with unparsable date: %r", rawdate)
        continue
    day, month, year = (int(part) for part in match.groups())
    items.append(
        PyRSS2Gen.RSSItem(
            title=title + " (" + city + ")",
            link=link,
            categories=[],
            description=title + " (" + city + ") ",
            pubDate=datetime.datetime(year, month, day),
        )
    )

# Build the RSS feed.  feed_title is normally set earlier (from -t or the
# page title); the lookup below is only a last-resort fallback.
if not feed_title:
    # Fixed: the original 'feed_title or find(...).text' chain raised
    # AttributeError whenever the .sTitrePage element was absent.
    heading = soup.body.find(class_="sTitrePage")
    feed_title = heading.text if heading is not None else ""

rss = PyRSS2Gen.RSS2(
    title=feed_title,
    # NOTE(review): RSS 2.0 requires a channel <link>; consider base_url
    # here instead of None — left unchanged to preserve current output.
    link=None,
    description=feed_description or "",
    items=items)

# Emit the feed XML on stdout.
print(rss.to_xml())