import os
import multiprocessing
import hashlib
import sqlite3
import requests as req
from bs4 import BeautifulSoup
#
from antiRange import AntiRange, anti_ranges
import db
import config
import opts


def try_get(url : str):
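    """GET `url` with the configured timeout; return the response, or None on connection errors and timeouts."""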
    try:
        return req.get(url, timeout = config.request_time_out)
    except (req.exceptions.ConnectionError, req.exceptions.Timeout) as e:
        print('\033[31mConnection error on {0}\033[0m'.format(url), vars(e))
        return None


def print_status_got(page : int, status : int):
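    """Print the HTTP status received for `page`, coloured green for 200 and yellow otherwise."""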
    print('\033[32mOn page {page}, got {color}\'{status}\'\033[32m.\033[0m'
          .format(page = page,
                  color = '\033[32m' if status == 200 else '\033[33m',
                  status = status
                  )
          )


def get_threads_from_page(url : str):
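    """Fetch one board index page and return (response, thread elements); returns None if the request fails."""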
    response = try_get(url)
    if response is None:
        return
    threads = BeautifulSoup(response.text, 'html.parser').find_all(class_='thread')
    return response, threads


def get_boards_from_site():
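    """Fetch the front page and build db.Board entries from the board <select> element, skipping its first two options."""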
    r = try_get(config.base_url)
    if r is None:
        return
    board_elements = BeautifulSoup(r.text, 'html.parser').find("select").find_all("option")
    boards = [db.Board(i['value'], i.text) for i in board_elements[2:]]
    return boards


def archive_op(bs : BeautifulSoup, board : str):
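    """Archive the OP of a thread and return its post number; returns early if the OP is already archived."""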
    op = bs.find(class_='op')
    no = op.find_all(class_='post_no')[1].text
    if db.is_post_archieved(board, int(no)):
        return no
    subject = op.find(class_='subject')
    subject = subject.text if subject is not None else ''
    t = db.Post(
        no = no,
        poster = op.find(class_='name').text,
        poster_id = op.find(class_='poster_id').text,
        date = op.find('time').text,
        subject = subject,
        text = op.find(class_='body').decode_contents(),
        board = board,
        num_files = len(op.find_all(class_='file'))
    )
    db.insert_post(t, board)
    return no


def archive_posts(op : str, bs : BeautifulSoup, board : str):
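    """Archive a thread's replies in reverse document order, stopping at the first reply that is already archived."""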
    posts = bs.find_all(class_='reply')
    posts.reverse()
    for p in posts:
        no = p.find_all(class_='post_no')[1].text
        if db.is_post_archieved(board, int(no)):
            return
        post = db.Post(
            no = no,
            poster = p.find(class_='name').text,
            poster_id = p.find(class_='poster_id').text,
            date = p.find('time').text,
            text = p.find(class_='body').decode_contents(),
            thread = op,
            num_files = len(p.find_all(class_='file'))
        )
        db.insert_post(post, board)


def archive_file(board : str, post : str, fileinfo : BeautifulSoup, c : sqlite3.Connection, clutter = False):
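    """Download one attachment to files/<blake2s(name)> and record it through connection `c`; files already on disk are skipped unless `clutter` is True."""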
    name = fileinfo.find('span').find('span').text
    path = 'files/' + hashlib.blake2s(name.encode()).hexdigest()
    if not clutter and os.path.isfile(path):
        print('\t\033[33mFile \033[34m\'', path, '\'\033[33m already exists.\033[0m', sep='')
        return
    r = try_get(config.base_url + fileinfo.find('a').attrs['href'])
    if r is None:
        return
    with open(path, 'wb') as f:
        f.write(r.content)
    f = db.File(
        name,
        post,
        board,
        path
    )
    db.insert_file(f, c)


def archive_files(bs : BeautifulSoup, board : str):
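    """Download the OP's attachments synchronously, then download each reply's attachments in worker processes, drawing connections from db.connection_pool."""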
    multiprocessing.Event()
    files = bs.find(class_='files')
    for fileinfo in files.find_all(class_='fileinfo'):
        archive_file(board,
                     bs.find(class_='thread').attrs['id'].split('_')[1],
                     fileinfo,
                     db.connection_pool[0]
                     )
    thread_pool = []
    for p in bs.find_all(class_='post')[1:]:
        i = p.find_all(class_='fileinfo')
        for fileinfo in i:
            no = p.attrs['id'].split('_')[1]
            con = None
            while True:
                # take a free connection from the pool, waiting until one is produced
                with db.connection_pool_lock:
                    if len(db.connection_pool) != 0:
                        con = db.connection_pool.pop(0)
                if con is None:
                    db.connection_produced.wait()
                else:
                    break
            thread = multiprocessing.Process(target=archive_file, args=[board, no, fileinfo, con])
            with db.connection_pool_lock:
                db.connection_pool.append(con)
            thread.daemon = True
            thread_pool.append(thread)
            thread.start()
    for t in thread_pool:
        t.join()


def archive_thread(url : str, board : str):
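    """Archive one thread: its OP, replies and files; skips threads that 404 or that the config filters out, unless opts.archive_all is set."""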
    print(''.join(['\033[33mScraping: ', url, '.\033[0m']))
    response = try_get(url)
    if response is None:
        return
    if response.url == config._404_url:
        print('\033[31mThread at ', url, ' 404\'d; it seems to have been deleted in the meantime.\033[0m', sep='')
        return
    p = BeautifulSoup(
        response.text,
        'html.parser'
    )
    del response
    if not opts.archive_all and not config.is_thread_allegeable(p):
        return
    op = archive_op(p, board)
    archive_posts(op, p, board)
    archive_files(p, board)


def archive_threads(board_name : str, threads : list):
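    """Archive every thread element scraped from a board index page."""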
    # the magic number '7' is len('thread_')
    for t in threads:
        archive_thread(
            ''.join([config.base_url, '/', board_name, '/res/', t.attrs['id'][7:], '.html']),
            board_name
        )


def archive_board(board_name : str):
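    """Walk the board's index pages from config.min_page up to (but not including) config.max_page and archive every thread found; stops early if the board 404s."""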
    board_url = config.base_url + board_name
    status = 0
    for i in range(config.min_page, config.max_page):
        if i == 1:
            url = board_url + '/index.html'
        else:
            url = ''.join([board_url, '/', str(i), '.html'])
        try:
            response, threads = get_threads_from_page(url)
        except TypeError:
            # get_threads_from_page returned None (failed request), so it cannot be unpacked
            continue
        print_status_got(i, response.status_code)
        if response.url == config._404_url:
            return
        elif response.status_code != 200: # add better error handling
            #talom['board_url'] = ['board', 5]
            continue
        archive_threads(board_name, threads)


def repair_corrupted(board : str, op : str, no : str):
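    """Re-download the attachments of post `no` in thread `op` on `board`, overwriting whatever is already on disk."""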
    response = try_get(''.join([config.base_url, '/', board, '/res/', op, '.html']))
    if response is None:
        return
    thread = BeautifulSoup(
        response.text,
        'html.parser'
    )
    posts = thread.find_all(class_='post')
    fileinfos = None
    l = 0
    h = len(posts) - 1
    # binary search for the post with number `no`; posts appear in ascending numeric order
    while True:
        c = (l + h) // 2
        n = posts[c].attrs['id'].split('_')[1]
        if n == no:
            fileinfos = posts[c].find_all(class_='fileinfo')
            break
        if h - l < 2:
            # the search interval has collapsed: check the upper bound, then stop either way
            hno = posts[h].attrs['id'].split('_')[1]
            if hno == no:
                fileinfos = posts[h].find_all(class_='fileinfo')
            break
        if int(n) < int(no): # compare numerically, not lexicographically
            l = c
        else:
            h = c
    if fileinfos is None:
        print('\033[31mCould not fetch fileinfos for \033[34m(', board, ', ', no, ')\033[31m.\033[0m', sep='')
        return
    thread_pool = []
    for fi in fileinfos:
        con = None
        while True:
            with db.connection_pool_lock:
                if len(db.connection_pool) != 0:
                    con = db.connection_pool.pop(0)
            if con is None:
                db.connection_produced.wait()
            else:
                break
        thread = multiprocessing.Process(target=archive_file, args=[board, no, fi, con, True])
        with db.connection_pool_lock:
            db.connection_pool.append(con)
        thread.daemon = True
        thread_pool.append(thread)
        thread.start()
    for t in thread_pool:
        t.join()
    print('\033[32mRepaired: \033[34m', board, '/', no, '\033[32m.\033[0m', sep='')