kincsem.hu_scapper/lovi_scrapper.py
2024-04-09 23:21:18 +02:00

52 lines
1.3 KiB
Python

#!/bin/python
import re
import json
import requests as reqs
import bs4
def get_page(url: str, timeout: float = 30.0) -> str:
    """Download *url* with an HTTP GET and return the response body as text.

    Progress is reported on stdout using ANSI colour codes (yellow before
    the request, green once it has completed).

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The decoded response body.  The HTTP status code is not checked,
        so an error page is returned like any other page.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    print(f"\033[33mLoading '{url}'...\033[0m")
    # Bound the wait so a single dead connection cannot stall the run forever.
    p = reqs.get(url, timeout=timeout)
    print(f"\033[32mLoaded '{url}'!\033[0m")
    return p.text
def js_coll_2_py(c: str):
    """Convert a JavaScript collection literal into the equivalent Python object.

    Only the ``' true,'`` spelling of the JavaScript boolean is translated
    (to ``' True,'``); everything else must already be valid Python literal
    syntax (lists, dicts, strings, numbers).

    Args:
        c: Source text of the JavaScript array/object literal.

    Returns:
        The Python object described by the literal (typically a list or dict).

    Raises:
        ValueError: If the text contains anything other than plain literals
            (for example a function call or an untranslated ``false``).
        SyntaxError: If the text is not syntactically valid.
    """
    # Local import keeps this helper self-contained.
    import ast

    r = c.replace(' true,', ' True,')
    # SECURITY: the input is scraped from a remote web page, so it must never
    # be passed to eval(); literal_eval accepts the same literal syntax but
    # refuses to execute arbitrary expressions.
    return ast.literal_eval(r)
def collect_horsey_pages(txt : str) -> [str]:
    """Extract the ``'date'`` value of every racing day listed in *txt*.

    The text is searched for ``racing_days = [...];`` assignments.  Each
    bracketed literal found is converted with ``js_coll_2_py`` and the
    ``'date'`` entry of every element is collected, in order of appearance.

    Args:
        txt: Page source to scan.

    Returns:
        The collected ``'date'`` values; an empty list when the page
        contains no ``racing_days`` assignment.
    """
    # The lookbehind/lookahead keep only the bracketed literal itself.
    racing_days_pattern = re.compile(r"(?<=racing_days = )\[.*\](?=;)")
    return [
        entry['date']
        for literal in racing_days_pattern.findall(txt)
        for entry in js_coll_2_py(literal)
    ]
def parse_horsey_page(s : str) -> [dict]:
    """Pull the raw ``races_table_divs[...] = {...};`` payloads out of *s*.

    For every such assignment, the text following the first ``{`` of the
    match is kept.  The opening brace is dropped, while the closing ``}``
    remains part of the returned string.

    NOTE(review): despite the ``[dict]`` annotation the elements are plain
    strings, not parsed dictionaries -- confirm what consumers expect.

    Args:
        s: Page source to scan.

    Returns:
        One string per matched assignment, in order of appearance; an
        empty list when nothing matches.
    """
    races_pattern = re.compile(r'(?<=races_table_divs)\[".*"\] = {.*}(?=;)')
    # Every match contains a '{' (the pattern requires one), so partition
    # always finds the separator and [2] is the text after the first brace.
    return [hit.partition('{')[2] for hit in races_pattern.findall(s)]
def main():
    """Scrape the gallop results for every year from 1996 to 2024 inclusive.

    For each year the racing-days page is downloaded and the dates listed
    on it are extracted.  For each date the results page is downloaded, its
    race payloads are extracted, and they are written as JSON to
    ``out/<date>.json``.  Progress is printed with ANSI colour codes.
    """
    # Local import keeps this fix self-contained within the entry point.
    import os

    year_range = range(1996, 2024 + 1)
    year_url = 'https://mla.kincsempark.hu/racing-days/gallop/{year}'
    date_url = 'https://mla.kincsempark.hu/results/gallop/{date}/'
    # Create the output directory up front; otherwise the first open() below
    # fails with FileNotFoundError after pages have already been downloaded.
    os.makedirs("out", exist_ok=True)
    for i in year_range:
        print(f"\033[33;1m# Iterating '{i}'.\033[0m")
        for h in collect_horsey_pages(get_page(year_url.format(year=i))):
            data = parse_horsey_page(get_page(date_url.format(date=h)))
            filename = f"out/{h}.json"
            print(f"\033[36mSaving file '{filename}'.\033[0m")
            with open(filename, 'w') as f:
                json.dump(data, f)
        print(f"\033[32;1m# Done iterating '{i}'.\033[0m")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()