summaryrefslogtreecommitdiff
path: root/webdiff
blob: dedbfda5800c7ca8b0ffc352c6ae9bde60916600 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
#!/usr/bin/env python
"""
webdiff: Get diff of web pages over time
"""
# v0.1
#
# Dependencies: python-sqlalchemy
#
# This program is free software. It comes without any warranty, to the
# extent permitted by applicable law. You can redistribute it and/or
# modify it under the terms of the Do What The Fuck You Want To Public
# License, Version 2, as published by Sam Hocevar.
# See http://sam.zoy.org/wtfpl/COPYING for more details.

from __future__ import unicode_literals
from __future__ import print_function
import sys
from subprocess import Popen, PIPE
import difflib
import datetime
from sqlalchemy import schema, types, engine, orm
import os
import re
import smtplib
import ssl
import socket
import argparse
try:
    from email.mime.text import MIMEText
except ImportError:  # python2
    from email.MIMEText import MIMEText
try:
    import configparser
except ImportError:  # python2
    import ConfigParser as configparser
try:
    from urllib.request import build_opener
except ImportError:  # python2
    from urllib2 import build_opener
try:
    from urllib.error import URLError, HTTPError
except ImportError:  # python2
    from urllib2 import URLError, HTTPError


# CONFIGURATION #################################################
# Configuration file candidates, read in order (later files override
# earlier ones when ConfigParser.read processes the list):
# system-wide /etc/webdiff first, then the per-user XDG config.
CONF_FILE = []
CONF_FILE.append(os.path.join(os.sep, 'etc', 'webdiff'))
# Per-user config: $XDG_CONFIG_HOME/webdiff (default ~/.config/webdiff).
xdgconf = os.path.join(
    os.getenv('XDG_CONFIG_HOME',
              os.path.join(os.path.expanduser('~'), '.config')),
    'webdiff'
)
CONF_FILE.append(xdgconf)
# Data directory: $XDG_DATA_HOME/webdiff (default ~/.local/share/webdiff).
DATA_DIR = os.path.join(
    os.getenv('XDG_DATA_HOME',
              os.path.join(os.path.expanduser('~'), '.local/share')),
    'webdiff'
)
# SQLite database holding the last-seen content of each watched page.
DB_FILE = os.path.join(DATA_DIR, 'webdiff.db')
# Built-in defaults, overridden below by the [general] section of the
# config file when present.  All values are strings (ConfigParser
# returns strings) and are converted at the point of use.
confset = {
    'db_engine': 'sqlite:///%s' % DB_FILE,
    'output': 'stdout',
    'smtp_server': 'localhost',
    'smtp_port': '25',
    'smtp_from': 'webdiff@localhost',
    'smtp_to': 'webdiff@localhost',
    'interval': '10',
    'block_skip': 'False',
    'format_html_command': '/usr/bin/html2text',
    'timeout': '60',
    'user-agent': 'webdiff/1.0',
}

#################################################################

help = """\
Default operation is to read a config file ($XDG_CONFIG_HOME/webdiff,
/etc/webdiff, in that order) in .ini format and then to check for modifications
in each webpage specified there. The config file may contain a [general]
section then one arbitrary named section for each webpage, containing at least
an url field. All the other fiels are optional.

[general] section:
    db_engine (default: db_engine=sqlite:///$XDG_DATA_HOME/webdiff/webdiff.db)
    output: email|stdout (default: output=stdout)
    smtp_server: SMTP server (default: smtp_server=localhost)
    smtp_port: SMTP server port (default smtp_port=25)
    smtp_from: SMTP From: (default: smtp_from=webdiff@localhost)
    smtp_to: SMTP To: (default: smtp_to=webdiff@localhost)
    interval: interval in minutes (default: interval=10)
    block_skip: skip content inside of the content block (True) or outside of
        the content block (False) (default: block_skip=False)
    format_html_command: command to format html content before diffing it.
        format_html_command must take its input from stdin and output to stdout
        (default: format_html_command=/usr/bin/html2text)
    timeout: webpage check timeout in seconds (default: timeout=60)
    user-agent: webpage check User-Agent (default: user-agent=webdiff/1.0)

[webpage] section:
    url: url to check (required)
    data: use POST instead of GET with data (default: ignored)
    interval (default: see [general] section)
    block_from: first match (after applying format_html_command) of block_from
        will delimit the beginning of the content block (default: ignored)
    block_to: last match (after applying format_html_command) of block_to will
        delimit the end of the content block (default: ignored)
    block_skip (default: see [general] section)
    format_html_command (default: see [general] section)
"""
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description='webdiff: get diff of webpages over time',
    epilog=help.format(os.path.basename(sys.argv[0])))
args = parser.parse_args()

conffile = configparser.ConfigParser()

try:
    conffile.read(CONF_FILE)
except configparser.Error as e:
    sys.stderr.write('Unable to parse configuration file "%s".\n' % CONF_FILE)
    sys.stderr.write(str(e))
    sys.exit(3)

for key in confset.keys():
    if conffile.has_option('general', key):
        confset[key] = conffile.get('general', key)

if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)


def now():
    """Return the current local timestamp (also the DB column default)."""
    timestamp = datetime.datetime.now()
    return timestamp

# One row per watched page (one per non-[general] config section).
metadata = schema.MetaData()
page_table = schema.Table(
    'page', metadata,
    schema.Column('page_id', types.Integer, primary_key=True),
    schema.Column('section', types.String),
    schema.Column('url', types.String),
    # Pickled list of content lines (protocol 2 for py2 compatibility).
    schema.Column('content', types.PickleType(protocol=2)),  # python2
    schema.Column('last_check', types.DateTime(), default=now),
    schema.Column('last_updated', types.DateTime(), default=now),
)

# Shared URL opener carrying the configured User-Agent header.
global_user_agent = confset['user-agent']
opener = build_opener()
opener.addheaders = [('User-agent', global_user_agent)]

# Create the engine and the schema if missing.  NOTE: this rebinds the
# module-level name `engine` from the sqlalchemy module to the instance.
engine = engine.create_engine(confset['db_engine'])
metadata.bind = engine
metadata.create_all(checkfirst=True)


class Page(object):
    """ORM-mapped record for one watched page (columns from page_table)."""
    pass

# Classical mapping: exposes page_table's columns as Page attributes.
# (orm.mapper is the pre-1.4 SQLAlchemy API.)
orm.mapper(Page, page_table)

# Session factory; scoped_session provides a shared session registry.
sm = orm.sessionmaker(bind=engine, autoflush=True, autocommit=False,
                      expire_on_commit=True)
session = orm.scoped_session(sm)


def partition(l, cond):
    """Split *l* around the first element satisfying *cond*.

    Returns a 3-tuple (before, match, after): *match* is a one-element
    list holding the first element for which cond(x) is truthy,
    *before* the elements preceding it and *after* the elements
    following it.  When no element matches, returns (list(l), [], []).
    """
    # Single O(n) scan; the previous implementation re-sliced the list
    # on every iteration, making it O(n^2).
    for i, x in enumerate(l):
        if cond(x):
            return (l[:i], [l[i]], l[i + 1:])
    return (list(l), [], [])


def re_search(reblock, x, default=False):
    """Search *x* for the regex *reblock*.

    Returns *default* when reblock is None (i.e. the corresponding
    block option was not configured), otherwise the re.search result.
    """
    if reblock is None:
        return default
    return re.search(reblock, x)


def block_filter(content, block_from, block_to, block_skip):
    """
    Filter *content* (a list of lines) against the content block
    delimited by the block_from / block_to regexes.

    block_skip falsy: keep only the block itself -- from the first line
    matching block_from up to and including the first subsequent line
    matching block_to.
    block_skip truthy: drop the block instead -- keep the lines before
    the first block_from match plus the lines after the last block_to
    match inside the block.
    NOTE(review): the program help advertises "last match" semantics
    for block_to, but the keep case uses the first match -- confirm
    which is intended.

    A missing block_from defaults to matching the first line (default
    True); a missing block_to matches nothing (default False), so the
    block extends to the end of content.
    """
    if block_from or block_to:
        if block_skip:
            # b: lines before the block; f: the block_from line itself.
            b, f, block = partition(content,
                                    lambda x: re_search(block_from, x, True))
            # Scan the block back-to-front (with the from-line appended
            # as a sentinel) to locate the last block_to match.
            revblock, t, r = partition(block[::-1] + f,
                                       lambda x: re_search(block_to, x, False))
            if len(r) == 0:
                if len(t) == 0:
                    # No block_to match: the block runs to the end of
                    # content, so nothing after it survives.
                    revblock = []
                else:
                    # The match sits at the very end of the reversed
                    # list (the sentinel position); trim it.
                    revblock = revblock[:-1]
            return b + revblock[::-1]
        else:
            # Keep only the block: from-line + interior + to-line.
            _, f, block = partition(content,
                                    lambda x: re_search(block_from, x, True))
            block, t, _ = partition(block,
                                    lambda x: re_search(block_to, x, False))
            return f + block + t
    else:
        # No block configured: pass content through unchanged.
        return content


def check_url(url, data, timeout, block_from, block_to, block_skip,
              format_html):
    """
    Fetch *url* (POST when *data* is not None, else GET), pipe the raw
    HTML through the *format_html* command, apply the block filter and
    return a tuple (content_lines, last_check_datetime).

    The fetch is retried up to 3 times; after the third failure a
    one-line error description is used as the page content, so the
    failure shows up in the diff sent to the user.
    """
    retry = 1
    html_content = b""  # bytes: Popen pipes and response.read() are bytes
    while True:
        try:
            response = opener.open(url, data=data, timeout=timeout)
            html_content = response.read()
            break
        except Exception as error:
            if retry >= 3:
                # Give up: turn the error into page content.
                # isinstance (not ==) is required, and order matters:
                # HTTPError subclasses URLError, socket.timeout
                # subclasses OSError/socket.error.
                if isinstance(error, HTTPError):
                    html_content = ('HTTP code: %s - %s'
                                    % (error.code, error.msg)).encode()
                elif isinstance(error, ssl.SSLError):
                    html_content = ('SSL Error: %s' % error).encode()
                elif isinstance(error, socket.timeout):
                    html_content = ('Socket Timeout: %s' % error).encode()
                elif isinstance(error, URLError):
                    html_content = ('URL Error: %s' % error.reason).encode()
                elif isinstance(error, socket.error):
                    html_content = ('Socket Error: %s' % error).encode()
                else:
                    html_content = ('Error: %s' % error).encode()
                break
            # Not exhausted yet: loop again for another attempt.
            retry += 1
    # Convert HTML to text with the external formatter (shell command).
    p = Popen(format_html, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True)
    out, err = p.communicate(input=html_content)
    if err:
        # Surface formatter diagnostics, one prefixed line each.
        prefix = "WARNING (%s): " % format_html
        err_text = err.decode('utf-8', errors='replace')
        print(prefix + ("\n" + prefix).join(err_text.split('\n')),
              file=sys.stderr)
    content = out.decode('utf-8', errors='replace').splitlines()
    content = block_filter(content, block_from, block_to, block_skip)
    last_check = now()
    return (content, last_check)


def diff(old_list, new_list, old_title, new_title):
    """Return a unified-diff line generator between two line lists."""
    delta = difflib.unified_diff(old_list, new_list, old_title, new_title)
    return delta


def check_page(section, url, data, timeout, interval, block_from, block_to,
               block_skip, format_html):
    """
    Check the page for *section* when its interval has elapsed.

    Returns (page, output) where *output* is an iterator of
    unified-diff lines, or None when the interval had not elapsed.
    On first sight of a section the page is stored and the whole
    content is reported as a diff against "never".
    """
    query = session.query(Page).filter_by(section=section)
    output = None
    if query.all() == []:
        # First check: store the page and diff against empty content.
        content, last_check = check_url(url, data, timeout, block_from,
                                        block_to, block_skip, format_html)
        page = Page()
        page.section = section
        page.url = url
        page.content = content
        page.last_check = last_check
        page.last_updated = last_check
        session.add(page)
        session.commit()
        output = diff([], content, "never", last_check)
    else:
        page = query[0]
        old_last_check = page.last_check
        if now() - old_last_check > datetime.timedelta(minutes=interval):
            content, last_check = check_url(url, data, timeout, block_from,
                                            block_to, block_skip, format_html)
            # Materialize the diff: a generator is always truthy, which
            # previously caused last_updated/content to be rewritten on
            # every check even when nothing changed.
            diff_lines = list(diff(page.content, content,
                                   old_last_check, last_check))
            if diff_lines:
                # Real change: record it.
                page.last_updated = last_check
                page.content = content
            page.last_check = last_check
            session.commit()
            # Callers expect an iterator (they probe it with next()).
            output = iter(diff_lines)
    return (page, output)


def format_output(section, url, last_updated, output):
    """
    Deliver the diff lines in *output* for *section* via the configured
    output channel: email over SMTP, or plain stdout.
    """
    out_format = confset['output']
    if out_format == 'email':
        # One newline-terminated string per diff line.
        text = "".join("%s\n" % line for line in output)
        msg = MIMEText("URL: %s\nLast updated: %s\n%s" %
                       (url, last_updated, text),
                       _subtype='plain', _charset='utf-8')
        msg['Subject'] = "webdiff changes: %s" % section
        msg['From'] = confset['smtp_from']
        msg['To'] = confset['smtp_to']
        smtp = smtplib.SMTP(confset['smtp_server'], int(confset['smtp_port']))
        smtp.sendmail(confset['smtp_from'], confset['smtp_to'],
                      msg.as_string())
        smtp.quit()
    elif out_format == 'stdout':
        print('[ %s ]' % section)
        print('URL: %s' % url)
        print('Last updated: %s' % last_updated)
        for line in output:
            print(line)
    else:
        print("ERROR: ", 'Unknown output format "%s"' % confset['output'],
              file=sys.stderr)


def option_or_default(section, option, default):
    """Return *option* from config *section*, or *default* when the
    option -- or the whole section -- is missing.

    NoSectionError must be caught too: conffile.get raises it when the
    section does not exist (e.g. a config file without [general]),
    which would otherwise abort the whole run.
    """
    try:
        return conffile.get(section, option)
    except (configparser.NoOptionError, configparser.NoSectionError):
        return default


def str2bool(s):
    """Parse a config-style boolean string.

    Returns True for "yes"/"true"/"t"/"1" (case-insensitive), False
    otherwise.  The `return` is essential: without it the function
    always yields None and block_skip can never be enabled.
    """
    return s.lower() in ("yes", "true", "t", "1")


if __name__ == "__main__":
    global_timeout = int(option_or_default('general', 'timeout',
                                           confset.get('timeout')))
    global_interval = int(option_or_default('general', 'interval',
                                            confset.get('interval')))
    global_block_skip = option_or_default('general', 'block_skip',
                                          confset.get('block_skip'))
    global_format_html = option_or_default('general', 'format_html_command',
                                           confset.get('format_html_command'))
    for section in conffile.sections():
        if section != 'general':
            url = conffile.get(section, 'url')
            data = option_or_default(section, 'data', None)
            timeout = int(option_or_default(section, 'timeout',
                                            global_timeout))
            interval = int(option_or_default(section, 'interval',
                                             global_interval))
            block_from = option_or_default(section, 'block_from', None)
            block_to = option_or_default(section, 'block_to', None)
            block_skip = str2bool(option_or_default(section, 'block_skip',
                                                    global_block_skip))
            format_html = option_or_default(section, 'format_html_command',
                                            global_format_html)
            page, output = check_page(section=section, url=url, data=data,
                                      timeout=timeout, interval=interval,
                                      block_from=block_from, block_to=block_to,
                                      block_skip=block_skip,
                                      format_html=format_html)
            if output:
                try:
                    next(output)
                    format_output(section, url, page.last_updated, output)
                except StopIteration:
                    pass