deliciousmonitor.py
"""
A module to monitor a delicious.com bookmark RSS feed and store it with some additional metadata to file.
(c) 2006-2008 Michael G. Noll <http://www.michael-noll.com/>
"""
import codecs
import datetime
import os
import sys
import time
try:
    import deliciousapi
except ImportError:
    print "ERROR: could not import DeliciousAPI module"
    print
    print "You can download DeliciousAPI from the Python Cheese Shop at"
    print "http://pypi.python.org/pypi/DeliciousAPI"
    print
    raise
try:
    import feedparser
except ImportError:
    print "ERROR: could not import Universal Feed Parser module"
    print
    print "You can download Universal Feed Parser from the Python Cheese Shop at"
    print "http://pypi.python.org/pypi/FeedParser"
    print
    raise

class DeliciousMonitor(object):
    """Monitors a delicious.com bookmark RSS feed, retrieves metadata for each bookmark and stores it to file.

    By default, the delicious.com hotlist (i.e. the front page) is monitored.
    Whenever the monitor discovers a new URL in a bookmark, it retrieves
    some metadata for it from delicious.com (currently, common tags and number
    of bookmarks) and stores this information to file.

    Note that URLs which have been processed in previous runs will NOT be
    processed again, i.e. delicious.com metadata will NOT be updated.

    """
def __init__(self, rss_url="http://feeds.delicious.com/v2/rss", filename="delicious-monitor.xml", log_filename="delicious-monitor.log", interval=30, verbose=True):
"""
Parameters:
rss_url (optional, default: "http://feeds.delicious.com/v2/rss")
The URL of the RSS feed to monitor.
filename (optional, default: "delicious-monitor.xml")
The name of the file to which metadata about the RSS feed will be stored.
log_filename (optional, default: "delicious-monitor.log")
The name of the log file, which is used to identify "new" entries in the RSS feed.
interval (optional, default: 30)
Time between monitor runs in minutes.
verbose (optional, default: True)
Whether to print non-critical processing information to STDOUT or not.
"""
self.rss_url = rss_url
self._delicious = deliciousapi.DeliciousAPI()
self.filename = filename
self.log_filename = log_filename
self.interval = interval
self.verbose = verbose
self.urls = []
# ensure that the name of the output file and log file is not None etc.
assert self.filename
assert self.log_filename
    def run(self):
        """Start the monitor."""
        while True:
            time_before_run = datetime.datetime.now()

            # do the actual monitoring work
            if self.verbose:
                print "[MONITOR] Starting monitor run - %s" % time_before_run.strftime("%Y-%m-%d @ %H:%M:%S")
            self.monitor()
            time_after_run = datetime.datetime.now()

            # calculate the number of seconds to wait until the next run
            interval = datetime.timedelta(seconds=60 * self.interval)
            next_run_time = time_before_run + interval
            elapsed = time_after_run - time_before_run
            if interval >= elapsed:
                wait_seconds = (interval - elapsed).seconds
            else:
                # the run took longer than our interval time between runs;
                # in this case, we continue immediately but will still wait
                # three seconds in order not to stress delicious.com too much
                wait_seconds = 3
                next_run_time = datetime.datetime.now() + datetime.timedelta(seconds=wait_seconds)

            # sleep until the next run
            if self.verbose:
                print "[MONITOR] Next monitor run on %s (sleeping for %s seconds)" % (next_run_time.strftime("%Y-%m-%d @ %H:%M:%S"), wait_seconds)
            time.sleep(wait_seconds)
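
    # Scheduling example: with the default interval=30, a run that starts at
    # 12:00:00 and finishes at 12:04:10 sleeps for the remaining ~25 minutes
    # 50 seconds, so the next run starts at 12:30:00. A run that overruns the
    # 30-minute interval triggers the next run after a fixed 3-second pause.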
    def monitor(self):
        """Performs a single monitoring pass: fetches the RSS feed and processes any new entries."""
        # download and parse RSS feed
        f = feedparser.parse(self.rss_url)

        output_file = codecs.open(self.filename, "a", "utf8")
        log_file = None
        if os.access(self.log_filename, os.F_OK):
            if self.verbose:
                print "[MONITOR] Log file found. Trying to resume...",
            try:
                # read in previous log data for resuming
                log_file = open(self.log_filename, 'r')
                # remove leading and trailing whitespace if any (incl. newlines)
                self.urls = [line.strip() for line in log_file.readlines()]
                log_file.close()
                if self.verbose:
                    print "done"
            except IOError:
                # the log file exists but could not be read
                if self.verbose:
                    print "failed"
        else:
            # log file does not exist, so there isn't any resume data
            # to read in
            pass
        try:
            # now open it for writing (i.e., appending) and logging
            if self.verbose:
                print "[MONITOR] Open log file for appending...",
            log_file = open(self.log_filename, 'a')
            if self.verbose:
                print "done"
        except IOError:
            if self.verbose:
                print "failed"
            print "[MONITOR] ERROR: could not open log file for appending"
            # clean up before aborting this run
            output_file.close()
            return
        # get only new entries, i.e. entries whose URLs we have not logged yet
        new_entries = [entry for entry in f.entries if entry.link not in self.urls]
        if self.verbose:
            print "[MONITOR] Found %s new entries" % len(new_entries)
        # query metadata about each entry from delicious.com
        for index, entry in enumerate(new_entries):
            url = entry.link
            if self.verbose:
                print "[MONITOR] Processing entry #%s: '%s'" % (index + 1, url),
            try:
                time.sleep(1)  # be nice and wait 1 sec between connects to delicious.com
                document = self._delicious.get_url(url)
            except deliciousapi.DeliciousError, error_string:
                if self.verbose:
                    print "failed"
                print "[MONITOR] ERROR: %s" % error_string
                # clean up
                output_file.close()
                log_file.close()
                return
            if self.verbose:
                print "done"

            # update log file
            log_file.write("%s\n" % url)

            # update output file; quoteattr() escapes attribute values so that
            # URLs and tags containing '&', '<' or '"' produce well-formed XML
            output_file.write('<document url=%s users="%s" top_tags="%s">\n' % (quoteattr(url), document.total_bookmarks, len(document.top_tags)))
            for tag, count in document.top_tags:
                output_file.write(' <top_tag name=%s count="%s" />\n' % (quoteattr(tag), count))
            output_file.write('</document>\n')
            output_file.flush()
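
        # Example of a single record appended to the output file
        # (attribute values are illustrative):
        #
        #   <document url="http://www.example.com/" users="1542" top_tags="2">
        #    <top_tag name="example" count="120" />
        #    <top_tag name="web" count="87" />
        #   </document>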

        # clean up
        output_file.close()
        log_file.close()
if __name__ == "__main__":
monitor = DeliciousMonitor(interval=30)
monitor.run()
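
# A minimal usage sketch for running the monitor with non-default settings.
# The feed URL, file names, and interval below are illustrative, not
# prescriptive; any delicious.com RSS feed URL should work the same way:
#
#   monitor = DeliciousMonitor(
#       rss_url="http://feeds.delicious.com/v2/rss/popular",
#       filename="popular-bookmarks.xml",
#       log_filename="popular-bookmarks.log",
#       interval=60,
#   )
#   monitor.run()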