vichan_scrapper/scrapper/scrap.py


import os
import multiprocessing
import hashlib
import sqlite3
import requests as req
from bs4 import BeautifulSoup
#
from antiRange import AntiRange, anti_ranges
import db
import config
import opts
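
# Core scraping routines for the vichan archiver: walk a site's boards and
# archive thread OPs, replies and attached files through the `db` module,
# within the page limits set in `config` and the switches exposed by `opts`.

# Fetch `url` and return the requests.Response, or None (after printing a
# warning) when the request raises a connection error or times out.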
def try_get(url : str):
    try:
        return req.get(url, timeout = config.request_time_out)
    except (req.exceptions.ConnectionError, req.exceptions.Timeout) as e:
        print('\033[31mConnection error on {0}\033[0m'.format(url), vars(e))
        return None
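
# Report the HTTP status received for an index page, coloured green for 200
# and yellow otherwise.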
def print_status_got(page : int, status : int):
    print('\033[32mOn page {page}, got {color}\'{status}\'\033[32m.\033[0m'
          .format(page = page,
                  color = '\033[32m' if status == 200 else '\033[33m',
                  status = status
                  )
          )
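
# Fetch one board index page and return (response, list of thread elements);
# returns None when the request failed, so callers that unpack the result
# get a TypeError they can catch.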
def get_threads_from_page(url : str):
    response = try_get(url)
    if response is None:
        return
    threads = BeautifulSoup(
        response.text,
        'html.parser'
    ).find_all(class_='thread')
    return response, threads
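
# Scrape the board list from the <select> element on the site's front page
# and build db.Board records. The first two <option> entries are skipped;
# presumably they are not real boards on this particular site.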
def get_boards_from_site():
    r = try_get(config.base_url)
    if r is None:
        return
    board_elements = BeautifulSoup(
        r.text,
        'html.parser'
    ).find("select").find_all("option")
    boards = [db.Board(i['value'], i.text) for i in board_elements[2:]]
    return boards
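
# Archive a thread's opening post and return its post number so replies can
# reference it; the insert is skipped if the post is already in the database.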
def archive_op(bs : BeautifulSoup, board : str):
    op = bs.find(class_='op')
    no = op.find_all(class_='post_no')[1].text
    if db.is_post_archieved(board, int(no)):
        return no
    subject = op.find(class_='subject')
    subject = subject.text if subject is not None else ''
    t = db.Post(
        no = no,
        poster = op.find(class_='name').text,
        poster_id = op.find(class_='poster_id').text,
        date = op.find('time').text,
        subject = subject,
        text = op.find(class_='body').decode_contents(),
        board = board,
        num_files = len(op.find_all(class_='file'))
    )
    db.insert_post(t, board)
    return no
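
# Archive a thread's replies, newest first, stopping at the first reply that
# is already archived (older replies are assumed to be stored already).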
def archive_posts(op : str, bs : BeautifulSoup, board : str):
    posts = bs.find_all(class_='reply')
    posts.reverse()
    for p in posts:
        no = p.find_all(class_='post_no')[1].text
        if db.is_post_archieved(board, int(no)):
            return
        post = db.Post(
            no = no,
            poster = p.find(class_='name').text,
            poster_id = p.find(class_='poster_id').text,
            date = p.find('time').text,
            text = p.find(class_='body').decode_contents(),
            thread = op,
            num_files = len(p.find_all(class_='file'))
        )
        db.insert_post(post, board)
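
# Download a single attachment described by a .fileinfo element and record
# it via db.insert_file. The file is stored as files/<blake2s(name)>; unless
# `clutter` is set, an already-present file is not downloaded again.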
def archive_file(board : str, post : str, fileinfo : BeautifulSoup, c : sqlite3.Connection, clutter = False):
    name = fileinfo.find('span')\
        .find('span').text
    path = 'files/' + hashlib.blake2s(name.encode()).hexdigest()
    if not clutter and os.path.isfile(path):
        print('\t\33[33mFile \033[34m\'', path, '\'\033[33m already exists.\033[0m', sep='')
        return
    r = try_get(config.base_url + fileinfo.find('a').attrs['href'])
    if r is None:
        return
    with open(path, 'wb') as f:
        f.write(r.content)
    f = db.File(
        name,
        post,
        board,
        path
    )
    db.insert_file(f, c)
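
# Download every attachment in a thread: the OP's files synchronously on the
# first pooled connection, then the replies' files in parallel worker
# processes, each borrowing a connection from db.connection_pool.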
def archive_files(bs : BeautifulSoup, board : str):
    multiprocessing.Event()  # NOTE: this Event is created but never used
    files = bs.find(class_='files')
    for fileinfo in files.find_all(class_='fileinfo'):
        archive_file(board,
                     bs.find(class_='thread').attrs['id'].split('_')[1],
                     fileinfo,
                     db.connection_pool[0]
                     )
    thread_pool = []
    for p in bs.find_all(class_='post')[1:]:
        i = p.find_all(class_='fileinfo')
        for fileinfo in i:
            no = p.attrs['id'].split('_')[1]
            con = None
            # borrow a connection from the pool, waiting until one is available
            while 1:
                with db.connection_pool_lock:
                    if len(db.connection_pool) != 0:
                        con = db.connection_pool.pop(0)
                if con is None:
                    db.connection_produced.wait()
                else:
                    break
            thread = multiprocessing.Process(target=archive_file, args=[board, no, fileinfo, con])
            with db.connection_pool_lock:
                db.connection_pool.append(con)
            thread.daemon = True
            thread_pool.append(thread)
            thread.start()
    for t in thread_pool:
        t.join()
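
# Archive one thread: fetch its page, skip it on 404, and (unless
# opts.archive_all is set) only continue when config.is_thread_allegeable
# accepts it; then store the OP, the replies and the files.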
def archive_thread(url : str, board : str):
    print(''.join(['\033[33mScrapping: ', url, '.\033[0m']))
    response = try_get(url)
    if response is None:
        return
    if response.url == config._404_url:
        print('\033[31mThread at ', url, ' 404d. It seems it has been deleted in the meantime.\033[0m')
        return
    p = BeautifulSoup(
        response.text,
        'html.parser'
    )
    del response
    if not opts.archive_all and not config.is_thread_allegeable(p):
        return
    op = archive_op(p, board)
    archive_posts(op, p, board)
    archive_files(p, board)
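
# Archive every thread listed on one board index page.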
def archive_threads(board_name : str, threads : list):
    # the magic number '7' is len('thread_')
    for t in threads:
        archive_thread(
            ''.join([config.base_url, '/', board_name, '/res/', t.attrs['id'][7:], '.html']),
            board_name
        )
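
# Walk a board's index pages (config.min_page up to, but not including,
# config.max_page) and archive the threads found on each; returns early if
# the board itself 404s.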
def archive_board(board_name : str):
    board_url = config.base_url + board_name
    status = 0
    for i in range(config.min_page, config.max_page):
        if i == 1:
            url = board_url + '/index.html'
        else:
            url = ''.join([board_url, '/', str(i), ".html"])
        try:
            response, threads = get_threads_from_page(url)
        except TypeError:
            continue
        print_status_got(i, response.status_code)
        if response.url == config._404_url:
            return
        elif response.status_code != 200: # add better error handling
            #talom['board_url'] = ['board', 5]
            continue
        archive_threads(board_name, threads)
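
# Re-download the attachments of one post whose files went missing or were
# corrupted: re-fetch its thread, locate the post by binary search on post
# number, and archive its files again with clutter=True so existing copies
# are overwritten.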
def repair_corrupted(board : str, op : str, no : str):
    response = try_get(''.join([config.base_url, '/', board, '/res/', op, '.html']))
    if response is None:
        return
    thread = BeautifulSoup(
        response.text,
        'html.parser'
    )
    posts = thread.find_all(class_='post')
    fileinfos = None
    # binary search for the post whose id ends in `no`
    l = 0
    h = len(posts)-1
    while 1:
        c = int((l + h) / 2)
        n = posts[c].attrs['id'].split('_')[1]
        if n == no:
            fileinfos = posts[c].find_all(class_='fileinfo')
            break
        if h - l < 2:
            hno = posts[h].attrs['id'].split('_')[1]
            if hno == no:
                fileinfos = posts[h].find_all(class_='fileinfo')
            break
        if int(n) < int(no): # compare numerically, not lexicographically
            l = c
        else:
            h = c
    if fileinfos is None:
        print('\033[31mCould not fetch fileinfos for \033[34m(', board, ', ', no, ')\033[31m.\033[0m', sep='' )
        return
    thread_pool = []
    for fi in fileinfos:
        con = None
        while 1:
            with db.connection_pool_lock:
                if len(db.connection_pool) != 0:
                    con = db.connection_pool.pop(0)
            if con is None:
                db.connection_produced.wait()
            else:
                break
        thread = multiprocessing.Process(target=archive_file, args=[board, no, fi, con, True])
        with db.connection_pool_lock:
            db.connection_pool.append(con)
        thread.daemon = True
        thread_pool.append(thread)
        thread.start()
    for t in thread_pool:
        t.join()
    print('\033[32mRepaired: \033[34m', board, '/', no, '\033[32m.\033[0m', sep='')
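
# Example driver (an illustrative sketch only, not the project's entry
# point; it assumes db's connection pool and config have been initialised
# elsewhere, and that db.Board exposes the scraped option value as a field):
#
#     boards = get_boards_from_site() or []
#     for b in boards:
#         archive_board(b.uri)  # `uri` is a hypothetical field name on db.Board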