about summary refs log tree commit diff
diff options
context:
space:
mode:
author Félix Sipma <felix.sipma@no-log.org> 2016-11-22 16:19:40 +0100
committer Félix Sipma <felix.sipma@no-log.org> 2016-11-22 16:19:40 +0100
commit fa78eadc5ffd06d47085630bbac08e7bd3229352 (patch)
tree e924e18c1d634a2f535df885f0395a99f4dd768f
parent 36e363ef6a88103e9c954f96b4e009981fcede6d (diff)
update leboncoin.py
-rwxr-xr-x doc/filters/leboncoin.py 44
1 files changed, 23 insertions, 21 deletions
diff --git a/doc/filters/leboncoin.py b/doc/filters/leboncoin.py
index 67c6b7e..7b440ce 100755
--- a/doc/filters/leboncoin.py
+++ b/doc/filters/leboncoin.py
@@ -83,18 +83,15 @@ html = sys.stdin.read().decode('iso-8859-1')
soup = BeautifulSoup(html, "lxml")
# region = soup.find(class_="entete").attrs.get('title')
-page_title = soup.head.find("meta", {"name": "title"}).attrs.get("content")
+page_title = soup.head.find("title")
+if not page_title:
+ logger.error("Can't find page_title")
+page_title = page_title.contents[0].strip()
if not feed_title:
feed_title = "LeBonCoin.fr - " + page_title
-if soup.body.find("div", {"class": "list-lbc"}):
- entries = soup.body.find(
- "div",
- {"class": "list-lbc"}
- ).find_all("a", recursive=False)
-else:
- entries = []
+entries = soup.body.find_all(itemtype="http://schema.org/Offer")
# Build rss items based on entries
items = []
@@ -108,29 +105,34 @@ class MyParserInfo(parserinfo):
MONTHS = [parserinfo.MONTHS[x] + months[x] for x in range(12)]
for entry in entries:
+ # title = entry.find("div", {"class": "title"}).text.strip()
+ title = entry.a.attrs.get("title")
+ if not title:
+ logger.error("Can't find title for entry.")
+ link = entry.a.attrs.get("href")
+ if not link:
+ logger.error("Can't find link for entry %s.", title)
if entry.img:
photo_title = entry.img.attrs.get("alt")
photo_link = entry.img.attrs.get("src")
category = re.sub(
r"\s+",
" ",
- entry.find("div", {"class": "category"}).text.strip()
+ entry.find(itemprop="category").text.strip()
)
- rawprice = entry.find("div", {"class": "price"})
+ rawprice = entry.find(itemprop="price")
if rawprice:
price = rawprice.text.strip()
else:
price = ""
- placement = re.sub(
- r"\s",
- "",
- entry.find("div", {"class": "placement"}).text
+ place = re.sub(
+ r"\s+",
+ " ",
+ entry.find(itemprop="availableAtOrFrom").text.strip()
)
- # title = entry.find("div", {"class": "title"}).text.strip()
- title = entry.attrs.get("title")
- link = entry.attrs.get("href")
- divs = entry.find("div", {"class": "date"}).find_all("div")
- divdate = " ".join([x.text for x in divs])
+ if not place:
+ logger.error("Can't find place for entry %s", title)
+ divdate = entry.find(itemprop="availabilityStarts").text.strip()
predate = re.sub("(?i)hier", yesterday.isoformat(), divdate)
predate = re.sub("(?i)aujourd'hui", today.isoformat(), predate)
@@ -139,10 +141,10 @@ for entry in entries:
date = parse(predate, fuzzy=True, parserinfo=MyParserInfo())
items.append(
PyRSS2Gen.RSSItem(
- title=title + " (" + placement + ") " + price,
+ title=title + " (" + place + ") " + price,
link=link,
categories=[category],
- description=title + " (" + placement + ") " + price +
+ description=title + " (" + place + ") " + price +
"; publication: " + divdate,
pubDate=date,
)