Projekt

Obecné

Profil

Podání #34950 » import_senate_votes.py

Marek Křejpský, 13.03.2021 13:06

 
# -*- coding: utf-8 -*-
"""Fetch the senat.cz voting RSS feed and print one text line per feed item.

An optional first command-line argument limits how many feed items are
processed. Despite the earlier "RSS to HTML" description, the output is
plain text, not HTML.
"""

__version__ = "0.0.2"
__author__ = "Barbierosa"



import sys
import requests
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET


# Switch stdin and stdout to UTF-8 regardless of the platform default encoding.
sys.stdin.reconfigure(encoding='utf-8')
sys.stdout.reconfigure(encoding='utf-8')
# Glue string used when the remaining title fragments are joined back together.
separator=' '

def monthToNum(shortMonth):
    """Return the month number (1-12) for a three-letter English abbreviation.

    The lookup ignores letter case and surrounding whitespace, so ``'mar'``,
    ``'Mar'`` and ``' MAR '`` all give ``3``.

    Args:
        shortMonth: Month abbreviation such as ``'jan'`` or ``'Dec'``.

    Returns:
        The month number as an ``int``.

    Raises:
        KeyError: If ``shortMonth`` is not a known abbreviation.
    """
    abbreviations = (
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    )
    numbers = {name: number for number, name in enumerate(abbreviations, start=1)}

    # Only strings can be normalised; anything else goes to the lookup as-is
    # so that it still fails with KeyError like the plain dict lookup did.
    key = shortMonth.strip().lower() if isinstance(shortMonth, str) else shortMonth
    try:
        return numbers[key]
    except KeyError:
        # Report the caller's original value, not the normalised one.
        raise KeyError(shortMonth) from None

# scraping function
def get_rss(url, count):
    """Download an RSS feed and return its first ``count`` items.

    Args:
        url: Address of the RSS feed to download.
        count: Maximum number of items to return; anything ``int()`` accepts.
            A value of zero or less yields an empty list.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'``, ``'published'``
        and ``'description'``, in feed order. An empty link falls back to
        ``'https://senat.cz'``. If the download or the parsing fails, the
        error is printed and an empty list is returned, so the result can
        always be iterated.
    """
    try:
        limit = int(count)

        # Without a timeout a stalled server would block the script forever.
        response = requests.get(url, timeout=30)
        # Do not try to parse an HTTP error page as if it were a feed.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, features='xml')

        # Slice before building the dicts so that a non-positive limit really
        # yields nothing instead of one item.
        items = soup.find_all('item')[:max(limit, 0)]
        return [
            {
                'title': item.find('title').text,
                'link': item.find('link').text or 'https://senat.cz',
                'published': item.find('pubDate').text,
                'description': item.find('description').text,
            }
            for item in items
        ]
    except Exception as e:
        # Best-effort boundary: report any failure (network, HTTP status,
        # missing feed element) and hand back an iterable result.
        print('The scraping job failed. See exception: ')
        print(e)
        return []

# Optional first command-line argument: maximum number of feed items to import.
if len(sys.argv) > 1:
    try:
        num_votes_to_import = int(sys.argv[1])
    except ValueError:
        # Exit with a readable message instead of a traceback.
        sys.exit('Invalid number of votes to import: ' + sys.argv[1])
else:
    # No limit requested: use a bound far above any realistic feed length.
    num_votes_to_import = 999999

print(f'Starting scraping for {num_votes_to_import}')

xml_articles = get_rss(
    'https://www.senat.cz/senatori/hlasovani_rss.php?pid=343',
    num_votes_to_import,
)

# "or []" keeps the loop safe when the scrape produced no usable result (None).
for a in xml_articles or []:
    try:
        # The publication date is expected to consist of six whitespace-separated
        # tokens: weekday, day, month abbreviation, year, time and zone.
        _weekday, day, month, year, _time, _zone = a['published'].split()
        datum = f'{day}.{monthToNum(month.lower())}.{year}'

        # The title is split on '-' and its first three fragments are dropped;
        # the remaining fragments are joined with the separator.
        title_fields = a['title'].split('-')
        if len(title_fields) < 3:
            raise ValueError('title has fewer than three "-" separated parts')
        nadpis = separator.join(title_fields[3:])

        # From the description, the second and third '-' separated fragments
        # are used; the third is cut at its first ';'.
        fields = a['description'].split('-')
        vysledek = fields[1] + ':' + fields[2].split(';')[0]
    except (ValueError, IndexError, KeyError) as e:
        # One malformed item must not abort the whole import; report it on
        # stderr so the regular output stays limited to well-formed lines.
        print(f'Skipping malformed item {a!r}: {e}', file=sys.stderr)
        continue

    print(datum + ' ' + nadpis + ' === ' + vysledek)

print('Finished scraping')
    (1-1/1)