Projekt

Obecné

Profil

Podání #34950 » import_senate_votes.py

Marek Křejpský, 13.03.2021 13:06

 
1
# -*- coding: utf-8 -*-
2
"""Fetch the senat.cz voting RSS feed and print one plain-text line per feed item."""
3

    
4
__version__ = "0.0.2"
5
__author__ = "Barbierosa"
6

    
7

    
8

    
9
import sys
10
import requests
11
from bs4 import BeautifulSoup
12
import xml.etree.ElementTree as ET
13

    
14

    
15
# Force UTF-8 on the standard streams so non-ASCII feed text is handled the
# same way regardless of the platform's default encoding.  ``reconfigure``
# only exists on io.TextIOWrapper, so streams that have been replaced by
# another file-like object (redirect helpers, test capture) are left alone.
for _stream in (sys.stdin, sys.stdout):
    _reconfigure = getattr(_stream, 'reconfigure', None)
    if _reconfigure is not None:
        _reconfigure(encoding='utf-8')

# String used to re-join the pieces of a title after it was split on '-'.
separator = ' '
18

    
19
# Lookup table built once at import time instead of on every call.
_MONTH_NUMBERS = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12,
}


def monthToNum(shortMonth):
    """Return the month number (1-12) for a three-letter English month abbreviation.

    The lookup ignores letter case and surrounding whitespace, so ``'mar'``,
    ``'Mar'`` and ``' MAR '`` all map to ``3``.

    Args:
        shortMonth: Month abbreviation such as ``'jan'`` or ``'Dec'``.

    Returns:
        The month number as an ``int`` between 1 and 12.

    Raises:
        KeyError: If ``shortMonth`` is not one of the twelve abbreviations.
    """
    # Only strings are normalised; any other value goes to the lookup as-is
    # and fails with the same exception type as before.
    key = shortMonth.strip().lower() if isinstance(shortMonth, str) else shortMonth
    try:
        return _MONTH_NUMBERS[key]
    except KeyError:
        raise KeyError(f'unknown month abbreviation: {shortMonth!r}') from None
35

    
36
def _tag_text(item, tag_name, default=''):
    """Return the text of the first ``tag_name`` tag inside ``item``, or ``default`` if absent."""
    tag = item.find(tag_name)
    return tag.text if tag is not None else default


# scraping function
def get_rss(url, count, timeout=30):
    """Download an RSS feed and return its first ``count`` items as dicts.

    Args:
        url: Address of the RSS/XML document to download.
        count: Maximum number of ``<item>`` elements to return; zero or a
            negative value yields an empty list.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'``,
        ``'published'`` and ``'description'``, in feed order.  A tag missing
        from an item is reported as an empty string, except ``'link'``,
        which falls back to ``'https://senat.cz'`` when missing or empty.
        If the download or parsing fails, the error is printed and an empty
        list is returned so that callers can always iterate over the result.
    """
    try:
        limit = max(int(count), 0)
        # Without a timeout a stalled server would block the script forever.
        r = requests.get(url, timeout=timeout)
        # Treat HTTP error pages as failures instead of parsing them as a feed.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, features='xml')
        # Slicing applies the limit before any item is collected, so a limit
        # of 0 really returns nothing.
        articles = soup.find_all('item')[:limit]
    except Exception as e:
        # Best-effort boundary: report the problem and hand back "no data".
        print('The scraping job failed. See exception: ')
        print(e)
        return []

    return [
        {
            'title': _tag_text(a, 'title'),
            'link': _tag_text(a, 'link') or 'https://senat.cz',
            'published': _tag_text(a, 'pubDate'),
            'description': _tag_text(a, 'description'),
        }
        for a in articles
    ]
68

    
69
# Optional first command-line argument: maximum number of feed items to process.
if len(sys.argv) > 1:
    try:
        num_votes_to_import = int(sys.argv[1])
    except ValueError:
        sys.exit('Usage: ' + sys.argv[0] + ' [max_number_of_votes]')
else:
    # Large sentinel used as "no practical limit".
    num_votes_to_import = 999999

print('Starting scraping for '+str(num_votes_to_import))

xml_articles = get_rss('https://www.senat.cz/senatori/hlasovani_rss.php?pid=343', num_votes_to_import)

# Guard against get_rss handing back None (no result) so the loop does not
# crash with "TypeError: 'NoneType' object is not iterable".
for a in xml_articles or []:
    try:
        # 'published' is expected to split into weekday, day, month name,
        # year, time and zone; only day, month and year are used.
        _, day, month, year, *_ = a['published'].split()
        # datum = date, rendered as "<day>.<month number>.<year>".
        datum = day + '.' + str(monthToNum(month.lower())) + '.' + year

        # nadpis = heading: the title without its first three '-'-separated
        # parts, with the remaining parts re-joined using ``separator``.
        nadpis = separator.join(a['title'].split('-')[3:])

        # The description is '-'-separated; the output uses its second part
        # and the third part up to the first ';'.
        fields = a['description'].split('-')
        line = datum+"   "+nadpis+'  ===  '+fields[1]+':'+fields[2].split(';')[0]
    except (ValueError, KeyError, IndexError) as e:
        # One malformed item should not abort the whole run.
        print('Skipping malformed feed item: ' + repr(e), file=sys.stderr)
        continue
    print(line)

print('Finished scraping')
    (1-1/1)