init
9  .gitignore  vendored  Normal file
@@ -0,0 +1,9 @@
**/__pycache__/
scrapper/venv
scrapper/service/scrapper.lock
vichan_archive.tar.gz
vichan_archive_data.tar.gz
db/data.sqlite
db/data.sqlite.bak
db/files/*
db/files.bak
52  Makefile  Normal file
@@ -0,0 +1,52 @@
pass:
	@echo pass

# ----
include config.mk

VHOSTS_D := $(APACHE_CONFIG_DIR)/vhosts.d/
SHELL := /bin/bash

clone:
	tar -I 'gzip --best' -c db/data.sqlite db/files/ -f front_end/vichan_archive_data.tar.gz
	git archive --output=front_end/vichan_archive.tar.gz master


init_db:
	-mv db/data.sqlite db/data.sqlite.bak
	-rm -r db/files.bak
	-mv db/files db/files.bak
	-mkdir db/files; touch db/files/.placeholder
	cd db/; cat init.sql | sqlite3


init_python:
	cd scrapper/; \
	python -m venv venv; \
	source venv/bin/activate; \
	pip install -r requirements.txt


init: init_db init_python


server:
	-mkdir $(VHOSTS_D)
	m4 srv/archive.apache2.vhost.conf.m4 > $(VHOSTS_D)/archive.conf

service:
	cd scrapper/service/; \
	m4 cron.m4 > /etc/cron.d/fc_scrapper

scrap:
	cd scrapper/; \
	./run.sh

repair:
	cd scrapper/; \
	./run.sh -r

restore:
	-rm db/data.sqlite
	-cp db/data.sqlite.bak db/data.sqlite
60  README.md  Normal file
@@ -0,0 +1,60 @@
# Vichan Scrapper
> scrapper for archiving data from vichan instances, with a minimalistic front end included to ease local viewing

### Demo




### Disclaimer
The scrapper worked well for the specific instance it was built around and tested on;
however, it may fail on a different vichan version (which is unknown)
and/or configuration.

### Requirements
+ Python3
+ SQLite3
+ Apache2 or PHP (see Installation/Server)

### Installation
1. Meet the requirements
#### Base
```sh
$ make init  # initialize database and python environment
$ make clone # create tarballs for distribution
```
##### Server
###### For yourself
+ If the front end does not have to be publicly facing, it's easiest to use the PHP interpreter's built-in server.
1. Navigate to the front end's directory
```sh
$ cd <vichan_scrapper>/front_end/
```
2. Deploy the PHP server
```sh
$ php -S localhost:8000
```
3. Access it through your browser at:
localhost:8000
###### In production
1. Run Apache
2. Add the configs
```sh
$ make server
```
##### Schedule scrapper
```sh
$ make service
```
+ the default is to run the scrapper every hour, and to attempt to fetch missing files every 3 hours (see the generated cron file below)
+ for personalization see scrapper/service/cron.m4
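For reference, `make service` renders scrapper/service/cron.m4 into roughly the following cron file (a sketch: the checkout path is illustrative, m4 substitutes the real repository root for ROOT):
```
# /etc/cron.d/fc_scrapper
0 * * * *    root make -C "/path/to/vichan_scrapper" scrap
30 */3 * * * root make -C "/path/to/vichan_scrapper" repair
```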

### Configuration
#### Scrapper
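+ scrapper/config.py: the main knobs, quoted from the file as committed below; a minimal excerpt:
```python
boards = []                          # empty means 'all'; overridden by '-b'
min_page = 1                         # page range to scrap
max_page = 10000                     # overshooting causes no overhead
request_time_out = 5                 # seconds before giving up on a request
base_url = 'https://examplechan.org' # domain to scrap from
max_threads = 4                      # aim for 2-4x the available CPU cores
```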
#### Front end
+ front\_end/config.php (defaults shown below)
  - posts\_per\_page : int
  - search\_enabled : boolean
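  The defaults, as committed in front_end/config.php:
```php
<?php
$config['posts_per_page'] = 10;
$config['search_enabled'] = false;
?>
```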

#### Recommendations
+ use XFS
1  config.mk  Normal file
@@ -0,0 +1 @@
APACHE_CONFIG_DIR := /etc/apache2/vhosts.d/
50  db/init.sql  Normal file
@@ -0,0 +1,50 @@
.open data.sqlite

-- --------------------------------------------------------

--
-- Table structure for table `boards`
--

DROP TABLE IF EXISTS boards;
CREATE TABLE boards (name VARCHAR(10) PRIMARY KEY,
                     desc VARCHAR(24)
);

-- --------------------------------------------------------

--
-- Table structure for table `posts`
--

DROP TABLE IF EXISTS posts;
CREATE TABLE posts (
    id INTEGER,
    board INTEGER,
    thread INT(11) DEFAULT NULL,
    subject VARCHAR(100) DEFAULT NULL,
    email VARCHAR(30) DEFAULT NULL,
    name VARCHAR(35) DEFAULT NULL,
    trip VARCHAR(15) DEFAULT NULL,
    capcode VARCHAR(50) DEFAULT NULL,
    body text,
    time VARCHAR(30),
    num_files INT(11) DEFAULT 0, -- Used for integrity checks, NOT redundant
    --`filehash` text CHARACTER SET ascii
    PRIMARY KEY (id, board)
);

-- --------------------------------------------------------

--
-- Table structure for table `files`
--

DROP TABLE IF EXISTS files;
CREATE TABLE files (
    id INTEGER PRIMARY KEY,
    name TEXT,
    post INTEGER NOT NULL,
    board INTEGER NOT NULL,
    path text
);
210  docs/Retrowave Color Palette (4_8_2023 2_03_51 PM).html  Normal file
File diff suppressed because one or more lines are too long
BIN  docs/demo1.png  Normal file
Binary file not shown. Size: 2.6 MiB
BIN  docs/demo2.png  Normal file
Binary file not shown. Size: 2.0 MiB
27  front_end/404.php  Normal file
@@ -0,0 +1,27 @@
<!DOCTYPE html>
<html>
<head>
	<title>Example Archive - 404</title>
	<link rel="stylesheet" href="global.css">
	<meta charset="utf-8">
	<style>
		#_404_container {
			display: flex;
			justify-content: center;
		}
		#_404_container img {
			width: 60%;
		}
	</style>
</head>
<body>
	<div id=body_main>
		<div id=index_header>
			<p>Vichan Archive - 404</p>
		</div>
		<div id=_404_container>
			<img id="404" src="media/apu404.png" alt="apu404"></img>
		</div>
	</div>
</body>
</html>
BIN  front_end/android-chrome-192x192.png  Normal file
Binary file not shown. Size: 24 KiB
BIN  front_end/android-chrome-512x512.png  Normal file
Binary file not shown. Size: 135 KiB
BIN  front_end/apple-touch-icon.png  Normal file
Binary file not shown. Size: 22 KiB
110  front_end/board.php  Normal file
@@ -0,0 +1,110 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<?php
|
||||
require_once('global.php');
|
||||
require_once('config.php');
|
||||
|
||||
# Query validation
|
||||
if(!isset($_GET['page'])){
|
||||
$page = 1;
|
||||
}else{
|
||||
$page = intval($_GET['page']);
|
||||
}
|
||||
if(validate_board_name($_GET['board'])){
|
||||
$board = $_GET['board'];
|
||||
}else{
|
||||
header('Location: /404.php');
|
||||
die();
|
||||
}
|
||||
?>
|
||||
<head>
|
||||
<title>Examplechan - Archive /<?=$board?>/</title>
|
||||
<link rel="stylesheet" href="global.css">
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
span {
|
||||
color: #06df20;
|
||||
}
|
||||
|
||||
.thread {
|
||||
box-sizing: border-box;
|
||||
padding: 20px;
|
||||
}
|
||||
.thread:hover {
|
||||
background: teal;
|
||||
cursor: pointer;
|
||||
}
|
||||
.thread img {
|
||||
width: 200px;
|
||||
}
|
||||
.file {
|
||||
max-height: 400px;
|
||||
overflow-y: hidden;
|
||||
}
|
||||
.page_list {
|
||||
text-align: center;
|
||||
font-size: 2rem;
|
||||
color: yellow;
|
||||
font-weight: bold;
|
||||
}
|
||||
.page_list a:link {
|
||||
color: lime;
|
||||
font-weight: normal;
|
||||
}
|
||||
.page_list a:visited {
|
||||
color: lightgreen;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id=body_main>
|
||||
<div id=index_header>
|
||||
<p>Examplechan Archive - /<?=$board?>/</p>
|
||||
<a href="/">
|
||||
<img id=plant src="media/plant.png" alt="fc_logo"></img>
|
||||
</a>
|
||||
</div>
|
||||
<div class=page_list>
|
||||
[
|
||||
<?php
|
||||
$post_count = $db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'/' . $_GET['board'] . '/\' and thread is NULL;');
|
||||
$page_count = ceil($post_count / $config['posts_per_page']);
|
||||
for($i = 0; $i < $page_count; $i++):
|
||||
?>
|
||||
<a href="/board.php?board=<?=$board?>&page=<?=$i+1?>"><?=$i+1?></a>
|
||||
<?php
|
||||
endfor;
|
||||
?>
|
||||
]
|
||||
</div>
|
||||
<hr>
|
||||
<hr>
|
||||
<?php
|
||||
$query = 'SELECT * FROM posts WHERE ' .
|
||||
'board = \'/' . $board . '/\' ' .
|
||||
'AND ' .
|
||||
'thread IS NULL ' .
|
||||
'ORDER BY id DESC ' .
|
||||
'LIMIT ' . $config['posts_per_page'] . ' ' .
|
||||
'OFFSET ' . ($config['posts_per_page']*($page-1)) . ';';
|
||||
$results = $db->query($query);
|
||||
|
||||
while($row = $results->fetchArray()):
|
||||
?>
|
||||
<div class="thread" onclick="window.location='/post.php?board=<?=$board?>&post=<?=$row['id']?>';">
|
||||
<div>
|
||||
<?=print_post_head($row)?>
|
||||
<div class='files'>
|
||||
<?=print_files($row['id'], $row['board'])?>
|
||||
</div>
|
||||
<div class='post_body'>
|
||||
<?=$row['body']?>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<hr>
|
||||
<?php endwhile; ?>
|
||||
<script id=page_list_duplicator type="text/javascript" src="js/duplicate_page_list.js"></script>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
31  front_end/color_hash.php  Normal file
@@ -0,0 +1,31 @@
<?php
define('COLORS', [
	["#ff0000", "white"], /* Red */
	["#ffa500", "black"], /* Orange */
	["#ffff00", "black"], /* Yellow */
	["#00ff00", "black"], /* Lime */
	["#008000", "white"], /* Green */
	["#00ffff", "black"], /* Aquamarine */
	["#00bfff", "white"], /* Cyan */
	["#0000ff", "white"], /* Blue */
	["#4b0082", "white"], /* Indigo */
	["#ffc0cb", "black"], /* Pink */
	["#ff00ff", "black"], /* Magenta */
	["#ff7f50", "black"], /* Coral */
	["#fa8072", "white"], /* Salmon */
	["#ff6347", "white"], /* Tomato */
	["#ffd700", "black"], /* Gold */
	["#f0e68c", "black"], /* Khaki */
	["#d2b48c", "white"], /* Tan */
	["#d2691e", "white"], /* Chocolate */
	["#a0522d", "white"], /* Sienna */
	["#800000", "white"], /* Maroon */
	["#808080", "white"], /* Gray */
	["#000000", "white"], /* Black */
	["#ffffff", "black"]  /* White */
]);

function ids2color($id){
	return $id == 'ONION' ? ["#800080", "white"] /* Purple */ : COLORS[intval(crc32($id)) % 23];
}
?>
4  front_end/config.php  Normal file
@@ -0,0 +1,4 @@
<?php
$config['posts_per_page'] = 10;
$config['search_enabled'] = false;
?>
1  front_end/data.sqlite  Symbolic link
@@ -0,0 +1 @@
../scrapper/data.sqlite
65  front_end/downloads.php  Normal file
@@ -0,0 +1,65 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<?php
|
||||
require_once('config.php');
|
||||
?>
|
||||
<head>
|
||||
<title>ViChan - Archive</title>
|
||||
<link rel="stylesheet" href="global.css">
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
button {
|
||||
float: left;
|
||||
height: 100%;
|
||||
width: 100%;
|
||||
color: green;
|
||||
font-weight: bold;
|
||||
font-size: 2.4rem;
|
||||
}
|
||||
|
||||
a {
|
||||
display: inline-block;
|
||||
height: 100%;
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.bdiv {
|
||||
height: 100px;
|
||||
width: 300px;
|
||||
}
|
||||
|
||||
#mid {
|
||||
display: flex;
|
||||
justify-content: space-evenly;
|
||||
padding-top: 40px;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id=body_main>
|
||||
<div id=index_header>
|
||||
<p>Vichan Archive - Memetic core</p>
|
||||
</div>
|
||||
<hr>
|
||||
<hr>
|
||||
<div id=mid>
|
||||
<div class=bdiv>
|
||||
<a href="vichan_archive_data.tar.gz" download>
|
||||
<button class=flashy_button>
|
||||
Database + Files
|
||||
</button>
|
||||
</a>
|
||||
</div>
|
||||
<div class=bdiv>
|
||||
<a href="vichan_archive.tar.gz" download>
|
||||
<button class=flashy_button>
|
||||
Scrapper + Front end
|
||||
</button>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
BIN  front_end/favicon-16x16.png  Normal file
Binary file not shown. Size: 566 B
BIN  front_end/favicon-32x32.png  Normal file
Binary file not shown. Size: 1.4 KiB
BIN  front_end/favicon.ico  Normal file
Binary file not shown. Width: 48px, Height: 48px, Size: 15 KiB
1  front_end/files  Symbolic link
@@ -0,0 +1 @@
../db/files
74  front_end/global.css  Normal file
@@ -0,0 +1,74 @@
|
||||
:root {
|
||||
--std-border: solid green 5px;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
background-image: url("media/background.jpg");
|
||||
background-repeat: no-repeat;
|
||||
background-size: cover;
|
||||
background-attachment: fixed;
|
||||
color: teal;
|
||||
}
|
||||
|
||||
button {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
hr {
|
||||
color: rgba(0, 204, 0, 1);
|
||||
}
|
||||
|
||||
#body_main {
|
||||
margin: auto;
|
||||
margin-top: 70px;
|
||||
width: 70%;
|
||||
border: var(--std-border);
|
||||
padding: 10px 30px 50px 30px;
|
||||
background: rgba(0, 0, 0, 0.5);
|
||||
}
|
||||
#index_header {
|
||||
text-align: center;
|
||||
}
|
||||
#index_header p {
|
||||
font-size: 2rem;
|
||||
color: lime;
|
||||
}
|
||||
.file {
|
||||
vertical-align: top;
|
||||
display: inline-block;
|
||||
}
|
||||
.post_head {
|
||||
color: lime;
|
||||
}
|
||||
.post_body {
|
||||
color: white;
|
||||
display: inline-block;
|
||||
vertical-align: top;
|
||||
margin-top: 10px;
|
||||
}
|
||||
.subject {
|
||||
font-weight: bold;
|
||||
color: #39ff14;
|
||||
}
|
||||
.name {
|
||||
color: #39ff14;
|
||||
}
|
||||
.poster_id {
|
||||
font-weight: bold;
|
||||
border: 3px;
|
||||
border-radius: 5px;
|
||||
padding: 2px 4px 1px 4px;
|
||||
}
|
||||
.quote {
|
||||
color: #55d02e;
|
||||
}
|
||||
#plant {
|
||||
height: 256px;
|
||||
}
|
||||
|
||||
.flashy_button {
|
||||
background: rgba(45, 226, 230, 1);
|
||||
border: rgba(45, 226, 230, 1);
|
||||
border-radius: 10px;
|
||||
}
|
60  front_end/global.php  Normal file
@@ -0,0 +1,60 @@
|
||||
<?php
|
||||
require_once('color_hash.php');
|
||||
|
||||
$db = new SQLite3('data.sqlite', SQLITE3_OPEN_READONLY);
|
||||
|
||||
function print_post_head($p){
|
||||
$c = ids2color($p['capcode']);
|
||||
?>
|
||||
<div class="post_head">
|
||||
<span class='subject'><?=$p['subject']?></span>
|
||||
<span class='name'><?=$p['name']?></span>
|
||||
<a class='post_no'>No. <?=$p['id']?></a>
|
||||
<span class='poster_id' style="background: <?=$c[0]?>; color: <?=$c[1]?>">ID <?=strtoupper($p['capcode'])?></span>
|
||||
<span class='date'><?=$p['time']?></span>
|
||||
</div>
|
||||
<?php
|
||||
}
|
||||
|
||||
function print_file($f){
|
||||
$mt = mime_content_type($f['path']);
|
||||
?>
|
||||
<div class='file'>
|
||||
<div><?=$f['name']?></div>
|
||||
<?php
|
||||
if(strpos($mt, 'image/') === 0):
|
||||
?>
|
||||
<img src='<?=$f['path']?>'></img>
|
||||
<?php
|
||||
elseif(strpos($mt, 'video/') === 0):
|
||||
?>
|
||||
<video src='<?=$f['path']?>'></video>
|
||||
<?php
|
||||
endif;
|
||||
?>
|
||||
</div>
|
||||
<?php
|
||||
}
|
||||
|
||||
function print_files($no, $board){
|
||||
global $db;
|
||||
$query = 'SELECT * FROM files WHERE post = ' . $no . ' AND board = \'' . $board . '\' ORDER BY id;';
|
||||
$files = $db->query($query);
|
||||
if($files){
|
||||
while($f = $files->fetchArray()){
|
||||
print_file($f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function validate_board_name($s){
|
||||
global $db;
|
||||
$result = $db->query('SELECT name FROM boards;');
|
||||
$boards = array();
|
||||
while($row = $result->fetchArray()){
|
||||
array_push($boards, $row['name']);
|
||||
}
|
||||
|
||||
return in_array('/'.$s.'/', $boards);
|
||||
}
|
||||
?>
|
98  front_end/index.php  Normal file
@@ -0,0 +1,98 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<?php
|
||||
require_once('global.php');
|
||||
?>
|
||||
<head>
|
||||
<title>ExampleChan - Archive</title>
|
||||
<link rel="stylesheet" href="global.css">
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
#menu {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
gap: 0.5%;
|
||||
}
|
||||
#menu * {
|
||||
width: 100%;
|
||||
height: 30px;
|
||||
box-sizing: border-box
|
||||
}
|
||||
/* ------- */
|
||||
table {
|
||||
color: white;
|
||||
border: var(--std-border);
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
.hr {
|
||||
background: rgba(0, 128, 128, 0.5);
|
||||
}
|
||||
.hr:hover {
|
||||
background: forestgreen;
|
||||
cursor: pointer;
|
||||
/*font-weight: bold;*/
|
||||
}
|
||||
th, td {
|
||||
text-align: left;
|
||||
}
|
||||
th {
|
||||
color: lime;
|
||||
border: solid green 1px;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
td {
|
||||
padding-left: 1%;
|
||||
}
|
||||
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id=body_main>
|
||||
<div id=index_header>
|
||||
<p>Examplechan Archive</p>
|
||||
<img id="plant" src="media/plant.png" alt="fc_logo"></img>
|
||||
</div>
|
||||
<div id=menu>
|
||||
<a href="/downloads.php">
|
||||
<button class=flashy_button>
|
||||
Get a copy
|
||||
</button>
|
||||
</a>
|
||||
<a href="/search.php">
|
||||
<button class=flashy_button>
|
||||
Advanced search
|
||||
</button>
|
||||
</a>
|
||||
</div>
|
||||
<hr>
|
||||
<table id=board_list>
|
||||
<thead>
|
||||
<tr class="hr">
|
||||
<th>Board</th>
|
||||
<th>Threads</th>
|
||||
<th>Files</th>
|
||||
<th>Posts</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<?php
|
||||
$results = $db->query('SELECT * FROM boards;');
|
||||
|
||||
while($row = $results->fetchArray()):
|
||||
?>
|
||||
<tr class="hr" onclick="window.location='<?='/board.php?board='.trim($row['name'], '/')?>';">
|
||||
<td><?=$row['name']?> - <?=$row['desc']?></td>
|
||||
<td><?=$db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'' . $row['name'] . '\' AND thread IS NULL;')?></td>
|
||||
<td><?=$db->querySingle('SELECT COUNT(*) FROM posts INNER JOIN files ON posts.id = files.post and posts.board = files.board WHERE posts.board = \'' . $row['name'] . '\';')?></td>
|
||||
<td><?=$db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'' . $row['name'] . '\';')?></td>
|
||||
</tr>
|
||||
<?php endwhile; ?>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<script>
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
2  front_end/js/duplicate_page_list.js  Normal file
@@ -0,0 +1,2 @@
var pl2 = document.getElementsByClassName('page_list')[0].cloneNode(true);
document.getElementById('page_list_duplicator').replaceWith(pl2);
5  front_end/js/jquery.min.js  vendored  Normal file
File diff suppressed because one or more lines are too long
170  front_end/js/post-hover.js  Normal file
@@ -0,0 +1,170 @@
|
||||
/*
|
||||
* post-hover.js
|
||||
* https://github.com/savetheinternet/Tinyboard/blob/master/js/post-hover.js
|
||||
*
|
||||
* Released under the MIT license
|
||||
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
|
||||
* Copyright (c) 2013-2014 Marcin Łabanowski <marcin@6irc.net>
|
||||
* Copyright (c) 2013 Macil Tech <maciltech@gmail.com>
|
||||
*
|
||||
* Usage:
|
||||
* $config['additional_javascript'][] = 'js/jquery.min.js';
|
||||
* $config['additional_javascript'][] = 'js/post-hover.js';
|
||||
*
|
||||
*/
|
||||
|
||||
onready(function(){
|
||||
var dont_fetch_again = [];
|
||||
init_hover = function() {
|
||||
var $link = $(this);
|
||||
|
||||
var id;
|
||||
var matches;
|
||||
|
||||
if ($link.is('[data-thread]')) {
|
||||
id = $link.attr('data-thread');
|
||||
}
|
||||
else if(matches = $link.text().match(/^>>(?:>\/([^\/]+)\/)?(\d+)$/)) {
|
||||
id = matches[2];
|
||||
}
|
||||
else {
|
||||
return;
|
||||
}
|
||||
|
||||
var board = $(this);
|
||||
while (board.data('board') === undefined) {
|
||||
board = board.parent();
|
||||
}
|
||||
var threadid;
|
||||
if ($link.is('[data-thread]')) threadid = 0;
|
||||
else threadid = board.attr('id').replace("thread_", "");
|
||||
|
||||
board = board.data('board');
|
||||
|
||||
var parentboard = board;
|
||||
|
||||
if ($link.is('[data-thread]')) parentboard = $('form[name="post"] input[name="board"]').val();
|
||||
else if (matches[1] !== undefined) board = matches[1];
|
||||
|
||||
var $post = false;
|
||||
var hovering = false;
|
||||
var hovered_at;
|
||||
$link.hover(function(e) {
|
||||
hovering = true;
|
||||
hovered_at = {'x': e.pageX, 'y': e.pageY};
|
||||
|
||||
var start_hover = function($link) {
|
||||
if ($post.is(':visible') &&
|
||||
$post.offset().top >= $(window).scrollTop() &&
|
||||
$post.offset().top + $post.height() <= $(window).scrollTop() + $(window).height()) {
|
||||
// post is in view
|
||||
$post.addClass('highlighted');
|
||||
} else {
|
||||
var $newPost = $post.clone();
|
||||
$newPost.find('>.reply, >br').remove();
|
||||
$newPost.find('span.mentioned').remove();
|
||||
$newPost.find('a.post_anchor').remove();
|
||||
|
||||
$newPost
|
||||
.attr('id', 'post-hover-' + id)
|
||||
.attr('data-board', board)
|
||||
.addClass('post-hover')
|
||||
.css('border-style', 'solid')
|
||||
.css('box-shadow', '1px 1px 1px #999')
|
||||
.css('display', 'block')
|
||||
.css('position', 'absolute')
|
||||
.css('font-style', 'normal')
|
||||
.css('z-index', '100')
|
||||
.addClass('reply').addClass('post')
|
||||
.insertAfter($link.parent())
|
||||
|
||||
$link.trigger('mousemove');
|
||||
}
|
||||
};
|
||||
|
||||
$post = $('[data-board="' + board + '"] div.post#reply_' + id + ', [data-board="' + board + '"]div#thread_' + id);
|
||||
if($post.length > 0) {
|
||||
start_hover($(this));
|
||||
} else {
|
||||
var url = $link.attr('href').replace(/#.*$/, '');
|
||||
|
||||
if($.inArray(url, dont_fetch_again) != -1) {
|
||||
return;
|
||||
}
|
||||
dont_fetch_again.push(url);
|
||||
|
||||
$.ajax({
|
||||
url: url,
|
||||
context: document.body,
|
||||
success: function(data) {
|
||||
var mythreadid = $(data).find('div[id^="thread_"]').attr('id').replace("thread_", "");
|
||||
|
||||
if (mythreadid == threadid && parentboard == board) {
|
||||
$(data).find('div.post.reply').each(function() {
|
||||
if($('[data-board="' + board + '"] #' + $(this).attr('id')).length == 0) {
|
||||
$('[data-board="' + board + '"]#thread_' + threadid + " .post.reply:first").before($(this).hide().addClass('hidden'));
|
||||
}
|
||||
});
|
||||
}
|
||||
else if ($('[data-board="' + board + '"]#thread_'+mythreadid).length > 0) {
|
||||
$(data).find('div.post.reply').each(function() {
|
||||
if($('[data-board="' + board + '"] #' + $(this).attr('id')).length == 0) {
|
||||
$('[data-board="' + board + '"]#thread_' + mythreadid + " .post.reply:first").before($(this).hide().addClass('hidden'));
|
||||
}
|
||||
});
|
||||
}
|
||||
else {
|
||||
$(data).find('div[id^="thread_"]').hide().attr('data-cached', 'yes').prependTo('form[name="postcontrols"]');
|
||||
}
|
||||
|
||||
$post = $('[data-board="' + board + '"] div.post#reply_' + id + ', [data-board="' + board + '"]div#thread_' + id);
|
||||
|
||||
if(hovering && $post.length > 0) {
|
||||
start_hover($link);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}, function() {
|
||||
hovering = false;
|
||||
if(!$post)
|
||||
return;
|
||||
|
||||
$post.removeClass('highlighted');
|
||||
if($post.hasClass('hidden') || $post.data('cached') == 'yes')
|
||||
$post.css('display', 'none');
|
||||
$('.post-hover').remove();
|
||||
}).mousemove(function(e) {
|
||||
if(!$post)
|
||||
return;
|
||||
|
||||
var $hover = $('#post-hover-' + id + '[data-board="' + board + '"]');
|
||||
if($hover.length == 0)
|
||||
return;
|
||||
|
||||
var scrollTop = $(window).scrollTop();
|
||||
if ($link.is("[data-thread]")) scrollTop = 0;
|
||||
var epy = e.pageY;
|
||||
if ($link.is("[data-thread]")) epy -= $(window).scrollTop();
|
||||
|
||||
var top = (epy ? epy : hovered_at['y']) - 10;
|
||||
|
||||
if(epy < scrollTop + 15) {
|
||||
top = scrollTop;
|
||||
} else if(epy > scrollTop + $(window).height() - $hover.height() - 15) {
|
||||
top = scrollTop + $(window).height() - $hover.height() - 15;
|
||||
}
|
||||
|
||||
|
||||
$hover.css('left', (e.pageX ? e.pageX : hovered_at['x'])).css('top', top);
|
||||
});
|
||||
};
|
||||
|
||||
$('div.body a:not([rel="nofollow"])').each(init_hover);
|
||||
|
||||
// allow to work with auto-reload.js, etc.
|
||||
$(document).on('new_post', function(e, post) {
|
||||
$(post).find('div.body a:not([rel="nofollow"])').each(init_hover);
|
||||
});
|
||||
});
|
||||
|
61  front_end/js/show-backlinks.js  Normal file
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* show-backlinks.js
|
||||
* https://github.com/savetheinternet/Tinyboard/blob/master/js/show-backlinks.js
|
||||
*
|
||||
* Released under the MIT license
|
||||
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
|
||||
* Copyright (c) 2013-2014 Marcin Łabanowski <marcin@6irc.net>
|
||||
*
|
||||
* Usage:
|
||||
* $config['additional_javascript'][] = 'js/jquery.min.js';
|
||||
* // $config['additional_javascript'][] = 'js/post-hover'; (optional; must come first)
|
||||
* $config['additional_javascript'][] = 'js/show-backlinks.js';
|
||||
*
|
||||
*/
|
||||
|
||||
$(document).ready(function(){
|
||||
var showBackLinks = function() {
|
||||
var reply_id = $(this).attr('id').replace(/(^reply_)|(^op_)/, '');
|
||||
|
||||
$(this).find('div.body a:not([rel="nofollow"])').each(function() {
|
||||
var id, post, $mentioned;
|
||||
|
||||
if(id = $(this).text().match(/^>>(\d+)$/))
|
||||
id = id[1];
|
||||
else
|
||||
return;
|
||||
|
||||
$post = $('#reply_' + id);
|
||||
if($post.length == 0){
|
||||
$post = $('#op_' + id);
|
||||
if($post.length == 0)
|
||||
return;
|
||||
}
|
||||
|
||||
$mentioned = $post.find('p.intro span.mentioned');
|
||||
if($mentioned.length == 0)
|
||||
$mentioned = $('<span class="mentioned unimportant"></span>').appendTo($post.find('p.intro'));
|
||||
|
||||
if ($mentioned.find('a.mentioned-' + reply_id).length != 0)
|
||||
return;
|
||||
|
||||
var $link = $('<a class="mentioned-' + reply_id + '" onclick="highlightReply(\'' + reply_id + '\');" href="#' + reply_id + '">>>' +
|
||||
reply_id + '</a>');
|
||||
$link.appendTo($mentioned)
|
||||
|
||||
if (window.init_hover) {
|
||||
$link.each(init_hover);
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
$('div.post.reply').each(showBackLinks);
|
||||
$('div.post.op').each(showBackLinks);
|
||||
|
||||
$(document).on('new_post', function(e, post) {
|
||||
showBackLinks.call(post);
|
||||
if ($(post).hasClass("op")) {
|
||||
$(post).find('div.post.reply').each(showBackLinks);
|
||||
}
|
||||
});
|
||||
});
|
40  front_end/js/show-op.js  Normal file
@@ -0,0 +1,40 @@
|
||||
/*
|
||||
* show-op
|
||||
* https://github.com/savetheinternet/Tinyboard/blob/master/js/show-op.js
|
||||
*
|
||||
* Adds "(OP)" to >>X links when the OP is quoted.
|
||||
*
|
||||
* Released under the MIT license
|
||||
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
|
||||
* Copyright (c) 2014 Marcin Łabanowski <marcin@6irc.net>
|
||||
*
|
||||
* Usage:
|
||||
* $config['additional_javascript'][] = 'js/jquery.min.js';
|
||||
* $config['additional_javascript'][] = 'js/show-op.js';
|
||||
*
|
||||
*/
|
||||
|
||||
$(document).ready(function(){
|
||||
let OP = parseInt($('.op .post_no').text().replace(/^\D+/g, ""))
|
||||
if(isNaN(OP)){ return; }
|
||||
|
||||
var showOPLinks = function() {
|
||||
$(this).find('div.post_body a:not([rel="nofollow"])').each(function() {
|
||||
var postID;
|
||||
|
||||
if(postID = $(this).text().match(/^>>(\d+)$/))
|
||||
postID = postID[1];
|
||||
else
|
||||
return;
|
||||
|
||||
if (postID == OP) {
|
||||
$(this).after(' <small>(OP)</small>');
|
||||
}
|
||||
});
|
||||
};
|
||||
|
||||
$('div.post.reply').each(showOPLinks);
|
||||
});
|
||||
|
||||
|
||||
|
BIN  front_end/media/apu404.png  Normal file
Binary file not shown. Size: 78 KiB
BIN  front_end/media/background.jpg  Normal file
Binary file not shown. Size: 354 KiB
BIN  front_end/media/frogenkopf-neon.png  Normal file
Binary file not shown. Size: 500 KiB
BIN  front_end/media/plant.png  Normal file
Binary file not shown. Size: 285 KiB
11  front_end/playground.php  Normal file
@@ -0,0 +1,11 @@
<?php
require_once('global.php');
require_once('config.php');
$query = 'SELECT COUNT(*) count FROM posts
          WHERE
              board = \'/' . $_GET['board'] . '/\'
          AND
              thread is NULL; create table fuck_you (i int);';
echo $query . '</br>';
echo $db->querySingle($query) . '</br>';
?>
72  front_end/post.php  Normal file
@@ -0,0 +1,72 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<?php
|
||||
require_once('global.php');
|
||||
|
||||
# Query validation
|
||||
if(validate_board_name($_GET['board'])){
|
||||
$board = '/'.$_GET['board'].'/';
|
||||
}else{
|
||||
header('Location: /404.php');
|
||||
die();
|
||||
}
|
||||
?>
|
||||
<head>
|
||||
<title>Examplechan - Archive /<?=$board?>/<?=$_GET['post']?></title>
|
||||
<link rel="stylesheet" href="global.css">
|
||||
<meta charset="utf-8">
|
||||
<script type="text/javascript" src="js/jquery.min.js"></script>
|
||||
<script type="text/javascript" src="js/show-op.js"></script>
|
||||
<style>
|
||||
#body_main {
|
||||
margin-bottom: 200px;
|
||||
}
|
||||
.post img {
|
||||
width: 100%;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id=body_main>
|
||||
<div id=index_header>
|
||||
<p>Examplechan Archive - /<?=$board?>/</p>
|
||||
<p>Thread No. <?=$_GET['post']?><p>
|
||||
<a href="/">
|
||||
<img id=plant src="media/plant.png" alt="fc_logo"></img>
|
||||
</a>
|
||||
</div>
|
||||
<hr>
|
||||
<hr>
|
||||
<!-- ###### -->
|
||||
<div class="op post">
|
||||
<?php
|
||||
$query = 'SELECT * FROM posts WHERE id = ' . $_GET['post'] . ' AND board = \'' . $board . '\';';
|
||||
$thread = $db->query($query)->fetchArray();
|
||||
?>
|
||||
<div class='files'>
|
||||
<?=print_files($thread['id'], $thread['board'])?>
|
||||
</div>
|
||||
<?=print_post_head($thread)?>
|
||||
<div class='post_body'>
|
||||
<?=$thread['body']?>
|
||||
</div>
|
||||
</div>
|
||||
<!-- ###### -->
|
||||
<?php
|
||||
$posts = $db->query('SELECT * FROM posts WHERE thread = ' . $thread['id'] . ' AND board = \'' . $board . '\';');
|
||||
while($p = $posts->fetchArray()):
|
||||
?>
|
||||
<hr>
|
||||
<div class="reply post">
|
||||
<?=print_post_head($p)?>
|
||||
<div class='files'>
|
||||
<?=print_files($p['id'], $p['board'])?>
|
||||
</div>
|
||||
<div class='post_body'>
|
||||
<?=$p['body']?>
|
||||
</div>
|
||||
</div>
|
||||
<?php endwhile; ?>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
34  front_end/search.php  Normal file
@@ -0,0 +1,34 @@
<!DOCTYPE html>
<html>
<?php
	require_once('config.php');
?>
<head>
	<title>Examplechan - Archive</title>
	<link rel="stylesheet" href="global.css">
	<meta charset="utf-8">
	<style>
	</style>
</head>
<body>
	<div id=body_main>
		<div id=index_header>
			<p>Examplechan Archive - Advanced Search</p>
			<div>
				<?php
					if(!$config['search_enabled']){
						echo "<h3>Advanced search was disabled on this instance for security reasons. It is recommended that you get a local copy and search that way.</h3>";
						die();
					}
				?>
			</div>
		</div>
		<div id=search_box>
		</div>
		<div id=result_box>
		</div>
	</div>
	<script>
	</script>
</body>
</html>
1  front_end/site.webmanifest  Normal file
@@ -0,0 +1 @@
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}
17  scrapper/antiRange.py  Normal file
@@ -0,0 +1,17 @@
# Constantly querying whether we already have a post is expensive,
# and so is storing every post number in memory.
# Since the posts we have are going to be mostly continuous (in production at least),
# we can store only the border values and the missing values in their range.
class AntiRange:
    def __init__(self, range_ : list):
        if range_ == []:
            import sys
            self.min_ = sys.maxsize
            self.max_ = 0
            self.not_ = []
            return
        self.min_ = min(range_)
        self.max_ = max(range_)
        self.not_ = list(set(range(self.min_, self.max_)) - set(range_))

anti_ranges = {}
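A quick illustration of the idea above (hypothetical post numbers; the membership test mirrors is_post_archieved() in db.py further down):
```python
from antiRange import AntiRange

# Posts 10, 11, 13 and 15 are already archived; 12 and 14 are the gaps.
ar = AntiRange([10, 11, 13, 15])
print(ar.min_, ar.max_)   # 10 15
print(sorted(ar.not_))    # [12, 14] -- only the gaps are kept in memory
```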
37  scrapper/config.py  Normal file
@@ -0,0 +1,37 @@
from bs4 import BeautifulSoup

# Logical function to determine whether a thread should be archived.
# On return:
#   True  - do archive
#   False - do not archive
# All threads are filtered through this function unless '-a' is specified.
def is_thread_allegeable(p : BeautifulSoup):
    return True

# Range of pages to designate for scrapping.
# Ignored if '-a' is specified.
min_page = 1
max_page = 10000 # overshooting this value does not cause overhead
if min_page > max_page:
    raise Exception('Invalid page range [{0};{1}].'.format(min_page, max_page))

# List of boards to archive.
# Overridden by '-b'.
# Empty means 'all'.
boards = []

# Seconds to wait before giving up on each request
request_time_out = 5

# Domain to scrap from
base_url = 'https://examplechan.org'

# URL marking the 404 page.
# Dubiously, it does not return a 404 response code, therefore the URL must be tested.
_404_url = base_url + '/404.html'

# Maximum number of threads to create.
# Should be 2-4 times the number of available CPU cores.
# To determine the perfect value, experimenting is recommended.
# Go with <cores>*2 if you're clueless.
max_threads = 4
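A sketch of a non-trivial filter, assuming vichan's default markup (the same 'subject' class that scrap.py parses); the keyword is purely illustrative:
```python
from bs4 import BeautifulSoup

# Only archive threads whose subject mentions a keyword (hypothetical example).
def is_thread_allegeable(p : BeautifulSoup):
    subject = p.find(class_='subject')
    return subject is not None and 'archive' in subject.text.lower()
```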
1  scrapper/data.sqlite  Symbolic link
@@ -0,0 +1 @@
../db/data.sqlite
183  scrapper/db.py  Normal file
@@ -0,0 +1,183 @@
|
||||
import sqlite3
|
||||
import bisect
|
||||
import multiprocessing
|
||||
import random
|
||||
import time
|
||||
#
|
||||
import config
|
||||
from antiRange import AntiRange, anti_ranges
|
||||
|
||||
# --- Tricks i could still implement to make this faster ---
|
||||
# > connection pool
|
||||
# > pragma journal_mode = WAL;
|
||||
# > pragma synchronous = normal;
|
||||
|
||||
CONNECT_TO = "data.sqlite"
|
||||
|
||||
connection_pool = []
|
||||
connection_pool_lock = multiprocessing.Lock()
|
||||
connection_produced = multiprocessing.Event()
|
||||
|
||||
def connections_init():
|
||||
for i in range(config.max_threads):
|
||||
connection = sqlite3.Connection(CONNECT_TO, check_same_thread=False)
|
||||
connection_pool.append(connection)
|
||||
|
||||
|
||||
class Board:
|
||||
def __init__(self, n, d):
|
||||
self.name = n
|
||||
self.description = d
|
||||
|
||||
class Post:
|
||||
def __init__(self, no, poster, date, text,
|
||||
poster_id = None,
|
||||
num_files = 0,
|
||||
subject = None,
|
||||
board = None,
|
||||
thread = None
|
||||
):
|
||||
if board == None and thread == None:
|
||||
raise Exception('Orphan post')
|
||||
self.no = no
|
||||
self.poster = poster
|
||||
self.date = date
|
||||
self.poster_id = poster_id
|
||||
self.num_files = num_files
|
||||
self.subject = subject
|
||||
self.text = text
|
||||
self.board = board
|
||||
self.thread = thread
|
||||
|
||||
class File:
|
||||
def __init__(self, name, post, board, path):
|
||||
self.name = name
|
||||
self.post = post
|
||||
self.board = board
|
||||
self.path = path
|
||||
|
||||
def corrupt_posts():
|
||||
with sqlite3.Connection(CONNECT_TO) as con:
|
||||
cursor = con.cursor()
|
||||
cursor.execute(
|
||||
'SELECT posts.board, posts.id, posts.thread, file_count.count, posts.num_files \
|
||||
FROM \
|
||||
posts \
|
||||
INNER JOIN \
|
||||
(SELECT post, board, count(*) AS count \
|
||||
FROM \
|
||||
files \
|
||||
GROUP BY post) \
|
||||
file_count ON \
|
||||
posts.id = file_count.post \
|
||||
AND \
|
||||
posts.board = file_count.board \
|
||||
WHERE \
|
||||
(file_count.count is null and posts.num_files != 0) \
|
||||
OR \
|
||||
file_count.count < posts.num_files \
|
||||
;'
|
||||
)
|
||||
return cursor.fetchall()
|
||||
|
||||
def is_post_archieved(board: str, no : int):
|
||||
ar = anti_ranges[board]
|
||||
if no > ar.max_ or no < ar.min_:
|
||||
return False
|
||||
pos = bisect.bisect_left(ar.not_, no)
|
||||
if pos < len(ar.not_) and ar.not_[pos] == no:
|
||||
return False
|
||||
return True
|
||||
|
||||
def insert_file(f : File, con : sqlite3.Connection):
|
||||
query = "INSERT INTO files \
|
||||
(name, post, board, path) \
|
||||
VALUES \
|
||||
('{0}', '{1}', '{2}', '{3}');".format(
|
||||
f.name.replace("'", "''"),
|
||||
f.post,
|
||||
f.board,
|
||||
f.path
|
||||
)
|
||||
while 1:
|
||||
try:
|
||||
con.execute(query)
|
||||
con.commit()
|
||||
print('\t\033[32mArchived file \033[34m\"{0}\"\033[32m.\033[0m'.format(f.name))
|
||||
break
|
||||
except sqlite3.OperationalError:
|
||||
print('fuck, race condition', multiprocessing.current_process().pid)
|
||||
time.sleep(random.uniform(0.1, 1.0))
|
||||
|
||||
def insert_post(p : Post, board : str):
|
||||
if p.thread == None:
|
||||
var_col = 'subject'
|
||||
var_val = p.subject.replace("'", "''")
|
||||
else:
|
||||
var_col = 'thread'
|
||||
var_val = p.thread
|
||||
query = "INSERT INTO posts \
|
||||
( \
|
||||
id, \
|
||||
board, \
|
||||
name, \
|
||||
capcode, \
|
||||
time, \
|
||||
body, \
|
||||
num_files, \
|
||||
{var_col} \
|
||||
) \
|
||||
VALUES \
|
||||
( \
|
||||
'{id}', \
|
||||
'{board}', \
|
||||
'{name}', \
|
||||
'{capcode}', \
|
||||
'{date}', \
|
||||
'{body}', \
|
||||
{num_files}, \
|
||||
'{var_val}' \
|
||||
);".format(
|
||||
id = p.no,
|
||||
board = board,
|
||||
name = p.poster.replace("'", "''"),
|
||||
capcode = p.poster_id,
|
||||
date = p.date,
|
||||
body = p.text.replace("'", "''"),
|
||||
num_files = p.num_files,
|
||||
#
|
||||
var_col = var_col,
|
||||
var_val = var_val
|
||||
)
|
||||
try:
|
||||
with sqlite3.Connection(CONNECT_TO) as con:
|
||||
con.execute(query)
|
||||
msg = ''.join(['\t\033[32mArchived post no. \033[34m', p.no, '\033[32m'])
|
||||
if p.thread != None:
|
||||
msg = ''.join([msg, ' (belonging to thread: ', '\033[34m', p.thread, '\033[32m)'])
|
||||
msg = ''.join([msg, '.\033[0m'])
|
||||
print(msg)
|
||||
except sqlite3.IntegrityError:
|
||||
pass
|
||||
|
||||
def board2antirange(board : str):
|
||||
with sqlite3.Connection(CONNECT_TO) as con:
|
||||
query = "SELECT id FROM posts WHERE board = '{0}';".format(board)
|
||||
r = con.execute(query)
|
||||
return AntiRange([x[0] for x in r.fetchall()])
|
||||
|
||||
|
||||
def insert_board(b : Board):
|
||||
try:
|
||||
with sqlite3.Connection(CONNECT_TO) as con:
|
||||
con.execute("INSERT INTO boards (name, desc) \
|
||||
VALUES \
|
||||
('{0}', '{1}');".format(
|
||||
b.name,
|
||||
b.description
|
||||
)
|
||||
)
|
||||
except sqlite3.IntegrityError:
|
||||
pass
|
||||
|
||||
|
1  scrapper/files  Symbolic link
@@ -0,0 +1 @@
../db/files
89  scrapper/main.py  Executable file
@@ -0,0 +1,89 @@
|
||||
#!/bin/python3
|
||||
import os
|
||||
import sys
|
||||
import fcntl
|
||||
import signal
|
||||
import multiprocessing
|
||||
from bs4 import BeautifulSoup
|
||||
#
|
||||
from antiRange import AntiRange, anti_ranges
|
||||
import scrap
|
||||
import db
|
||||
import opts
|
||||
import config
|
||||
|
||||
#talom = {}
|
||||
lockf = None
|
||||
|
||||
def handler(signum, frame):
|
||||
print('\033[31mReceived SIGINT, exiting...\033[0m')
|
||||
exit(1)
|
||||
|
||||
def main(argv):
|
||||
signal.signal(signal.SIGINT, handler)
|
||||
# ---
|
||||
opts.opts(argv)
|
||||
# ---
|
||||
db.connections_init()
|
||||
# ---
|
||||
if opts.is_service:
|
||||
lockpath = 'service/scrapper.lock'
|
||||
lockf = open(lockpath, 'r+')
|
||||
while 1:
|
||||
try:
|
||||
fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
break
|
||||
except OSError:
|
||||
if opts.restart_service:
|
||||
prev_inst_pid = int(lockf.read())
|
||||
os.kill(prev_inst_pid, signal.SIGINT)
|
||||
print('\033[31mPrevious instance (\033[34m', prev_inst_pid, '\033[31m) killed.\033[0m', sep='')
|
||||
import time
|
||||
time.sleep(1)
|
||||
else:
|
||||
print('\033[31mAnother instance is blocking execution. Quiting...\033[0m')
|
||||
signal.raise_signal(signal.SIGINT)
|
||||
# NOT REACHED
|
||||
pid = os.getpid()
|
||||
lockf.seek(0, 0)
|
||||
lockf.truncate()
|
||||
lockf.write(str(pid))
|
||||
lockf.flush()
|
||||
# ---
|
||||
if opts.integrity_check:
|
||||
corrupted = db.corrupt_posts()
|
||||
print('\033[31mFound the following threads to be corrupted: \033[34m', str(corrupted), '\033[31m.\033[0m', sep='')
|
||||
for c in corrupted:
|
||||
board = c[0]
|
||||
no = str(c[1])
|
||||
op = str(c[2])
|
||||
got = 0 if c[3] == None else str(c[3])
|
||||
expected = c[4]
|
||||
print('\033[33mRepairing: \033[34m', board, no, ' (', got, '/', expected, ')\033[33m.\033[0m', sep='')
|
||||
scrap.repair_corrupted(board, op, no)
|
||||
if opts.only_integrity_check:
|
||||
return 0
|
||||
# ---
|
||||
if config.boards == []:
|
||||
print('\033[33mScrapping board names... \033[0m', end='')
|
||||
boards = scrap.get_boards_from_site()
|
||||
if boards == None:
|
||||
signal.raise_signal(signal.SIGINT)
|
||||
print('\033[32mDone. Got:\033[0m', '\033[34m{0}\033[0m'.format(str([b.name for b in boards])))
|
||||
else:
|
||||
boards = config.boards
|
||||
# ---
|
||||
for b in boards:
|
||||
print('\033[33mArchiving board: \033[34m\'{0}\'\033[0m'.format(b.name))
|
||||
db.insert_board(b)
|
||||
anti_ranges[b.name] = db.board2antirange(b.name)
|
||||
scrap.archive_board(b.name)
|
||||
print('\033[32mArchived board: \033[34m\'{0}\'\033[0m'.format(b.name))
|
||||
# ---
|
||||
print('\033[32mFinished.')
|
||||
|
||||
|
||||
if __name__ != '__main__':
|
||||
exit(1)
|
||||
|
||||
main(sys.argv)
|
41  scrapper/opts.py  Normal file
@@ -0,0 +1,41 @@
import getopt
#
import config
import usage

archive_all = False
integrity_check = False
only_integrity_check = False
is_service = False
restart_service = False

def opts(argv : list):
    global archive_all, integrity_check, only_integrity_check, is_service, restart_service
    try:
        opts = getopt.getopt(args = argv[1:], shortopts = 'ab:ish')[0]
        for o in opts:
            if o[0] == '-a':
                archive_all = True
                config.min_page = 1
                config.max_page = 10000
            elif o[0] == '-b':
                exec('config.boards = ' + o[1])
            elif o[0] == '-i':
                if not integrity_check:
                    integrity_check = True
                else:
                    only_integrity_check = True
            elif o[0] == '-s':
                if not is_service:
                    is_service = True
                else:
                    restart_service = True
            elif o[0] == '-h':
                usage.print_usage(argv[0])
                exit(0)
            else:
                raise getopt.GetoptError(msg = '', opt = o[0])
    except getopt.GetoptError as e:
        print("\033[31mUnrecognized command line option '{0}'.\033[0m".format(e.opt))
        usage.print_usage(argv[0])
        exit(1)
2  scrapper/requirements.txt  Normal file
@@ -0,0 +1,2 @@
requests
bs4
14  scrapper/run.sh  Executable file
@@ -0,0 +1,14 @@
#!/bin/bash

set -e

source venv/bin/activate

echo -16 > /proc/$$/oom_adj
echo -1000 > /proc/$$/oom_score_adj

if [ "$1" == '-r' ]; then
	python main.py -s -s -i -i
else
	python main.py -s -s
fi
239  scrapper/scrap.py  Normal file
@@ -0,0 +1,239 @@
|
||||
import os
|
||||
import multiprocessing
|
||||
import hashlib
|
||||
import sqlite3
|
||||
import requests as req
|
||||
from bs4 import BeautifulSoup
|
||||
#
|
||||
from antiRange import AntiRange, anti_ranges
|
||||
import db
|
||||
import config
|
||||
import opts
|
||||
|
||||
def try_get(url : str):
|
||||
try:
|
||||
return req.get(url, timeout = config.request_time_out)
|
||||
except (req.exceptions.ConnectionError, req.exceptions.Timeout) as e:
|
||||
print('\033[31mConnection error on {0}\033[0m'.format(url), vars(e))
|
||||
return None
|
||||
|
||||
def print_status_got(page : int, status : int):
|
||||
print('\033[32mOn page {page}, got {color}\'{status}\'\033[32m.\033[0m'
|
||||
.format(page = page,
|
||||
color = '\033[32m' if status == 200 else '\033[33m',
|
||||
status = status
|
||||
)
|
||||
)
|
||||
|
||||
def get_threads_from_page(url : str):
|
||||
response = try_get(url)
|
||||
if response == None:
|
||||
return
|
||||
threads = BeautifulSoup(
|
||||
response.text,
|
||||
'html.parser'
|
||||
) \
|
||||
.find_all(class_='thread')
|
||||
return response, threads
|
||||
|
||||
def get_boards_from_site():
|
||||
r = try_get(config.base_url)
|
||||
if r == None:
|
||||
return
|
||||
board_elements = BeautifulSoup(
|
||||
r.text,
|
||||
'html.parser'
|
||||
) \
|
||||
.find("select") \
|
||||
.find_all("option")
|
||||
boards = [db.Board(i['value'], i.text) for i in board_elements[2:]]
|
||||
return boards
|
||||
|
||||
def archive_op(bs : BeautifulSoup, board : str):
|
||||
op = bs.find(class_='op')
|
||||
no = op.find_all(class_='post_no')[1].text
|
||||
if db.is_post_archieved(board, int(no)):
|
||||
return no
|
||||
subject = op.find(class_='subject')
|
||||
subject = subject.text if subject != None else ''
|
||||
t = db.Post(
|
||||
no = no,
|
||||
poster = op.find(class_='name').text,
|
||||
poster_id = op.find(class_='poster_id').text,
|
||||
date = op.find('time').text,
|
||||
subject = subject,
|
||||
text = op.find(class_='body').decode_contents(),
|
||||
board = board,
|
||||
num_files = len(op.find_all(class_='file'))
|
||||
)
|
||||
db.insert_post(t, board)
|
||||
return no
|
||||
|
||||
def archive_posts(op : str, bs : BeautifulSoup, board : str):
|
||||
posts = bs.find_all(class_='reply')
|
||||
posts.reverse()
|
||||
for p in posts:
|
||||
no = p.find_all(class_='post_no')[1].text
|
||||
if db.is_post_archieved(board, int(no)):
|
||||
return
|
||||
post = db.Post(
|
||||
no = no,
|
||||
poster = p.find(class_='name').text,
|
||||
poster_id = p.find(class_='poster_id').text,
|
||||
date = p.find('time').text,
|
||||
text = p.find(class_='body').decode_contents(),
|
||||
thread = op,
|
||||
num_files = len(p.find_all(class_='file'))
|
||||
)
|
||||
db.insert_post(post, board)
|
||||
|
||||
def archive_file(board : str, post : str, fileinfo : BeautifulSoup, c : sqlite3.Connection, clutter = False):
|
||||
name = fileinfo.find('span')\
|
||||
.find('span').text
|
||||
path = 'files/' + hashlib.blake2s(name.encode()).hexdigest()
|
||||
if not clutter and os.path.isfile(path):
|
||||
print('\t\33[33mFile \033[34m\'', path, '\'\033[33m already exists.\033[0m', sep='')
|
||||
return
|
||||
r = try_get(config.base_url + fileinfo.find('a').attrs['href'])
|
||||
if r == None:
|
||||
return
|
||||
with open(path, 'wb') as f:
|
||||
f.write(r.content)
|
||||
f = db.File(
|
||||
name,
|
||||
post,
|
||||
board,
|
||||
path
|
||||
)
|
||||
db.insert_file(f, c)
|
||||
|
||||
def archive_files(bs : BeautifulSoup, board : str):
|
||||
multiprocessing.Event()
|
||||
files = bs.find(class_='files')
|
||||
for fileinfo in files.find_all(class_='fileinfo'):
|
||||
archive_file(board,
|
||||
bs.find(class_='thread').attrs['id'].split('_')[1],
|
||||
fileinfo,
|
||||
db.connection_pool[0]
|
||||
)
|
||||
thread_pool = []
|
||||
for p in bs.find_all(class_='post')[1:]:
|
||||
i = p.find_all(class_='fileinfo')
|
||||
for fileinfo in i:
|
||||
no = p.attrs['id'].split('_')[1]
|
||||
con = None
|
||||
while 1:
|
||||
with db.connection_pool_lock:
|
||||
if len(db.connection_pool) != 0:
|
||||
con = db.connection_pool.pop(0)
|
||||
if con == None:
|
||||
db.connection_produced.wait()
|
||||
else:
|
||||
break
|
||||
thread = multiprocessing.Process(target=archive_file, args=[board, no, fileinfo, con])
|
||||
with db.connection_pool_lock:
|
||||
db.connection_pool.append(con)
|
||||
thread.daemon = True
|
||||
thread_pool.append(thread)
|
||||
thread.start()
|
||||
for t in thread_pool:
|
||||
t.join()
|
||||
|
||||
def archive_thread(url : str, board : str):
|
||||
print(''.join(['\033[33mScrapping: ', url, '.\033[0m']))
|
||||
response = try_get(url)
|
||||
if response == None:
|
||||
return
|
||||
if response.url == config._404_url:
|
||||
print('\033[31mThread at ', url, ' 404d. It seems like it has been deleted in the meanwhile.\033[0m')
|
||||
return
|
||||
p = BeautifulSoup(
|
||||
response.text,
|
||||
'html.parser'
|
||||
)
|
||||
del response
|
||||
if not opts.archive_all and not config.is_thread_allegeable(p):
|
||||
return
|
||||
op = archive_op(p, board)
|
||||
archive_posts(op, p, board)
|
||||
archive_files(p, board)
|
||||
|
||||
def archive_threads(board_name : str, threads : list):
|
||||
# the magic number '7' is len('thread_')
|
||||
for t in threads:
|
||||
archive_thread(
|
||||
''.join([config.base_url, '/', board_name, '/res/', t.attrs['id'][7:], '.html']),
|
||||
board_name
|
||||
)
|
||||
|
||||
|
||||
def archive_board(board_name : str):
|
||||
board_url = config.base_url + board_name
|
||||
status = 0
|
||||
for i in range(config.min_page, config.max_page):
|
||||
if i == 1:
|
||||
url = board_url + '/index.html'
|
||||
else:
|
||||
url = ''.join([board_url, '/', str(i), ".html"])
|
||||
try:
|
||||
response, threads = get_threads_from_page(url)
|
||||
except TypeError:
|
||||
continue
|
||||
print_status_got(i, response.status_code)
|
||||
if response.url == (config._404_url):
|
||||
return
|
||||
elif response.status_code != 200: # add better error handling
|
||||
#talom['board_url'] = ['board', 5]
|
||||
continue
|
||||
archive_threads(board_name, threads)
|
||||
|
||||
|
||||
def repair_corrupted(board : str, op : str, no : str):
|
||||
response = try_get(''.join([config.base_url, '/', board, '/res/', op, '.html']))
|
||||
if response == None:
|
||||
return
|
||||
thread = BeautifulSoup(
|
||||
response.text,
|
||||
'html.parser'
|
||||
)
|
||||
posts = thread.find_all(class_='post')
|
||||
fileinfos = None
|
||||
l = 0
|
||||
h = len(posts)-1
|
||||
while 1:
|
||||
c = int((l + h) / 2)
|
||||
n = posts[c].attrs['id'].split('_')[1]
|
||||
if n == no:
|
||||
fileinfos = posts[c].find_all(class_='fileinfo')
|
||||
break
|
||||
if h - l < 2:
|
||||
hno = posts[h].attrs['id'].split('_')[1]
|
||||
if hno == no:
|
||||
fileinfos = posts[h].find_all(class_='fileinfo')
|
||||
break
|
||||
if n < no:
|
||||
l = c
|
||||
else:
|
||||
h = c
|
||||
if fileinfos == None:
|
||||
print('\033[31mCould not fetch fileinfos for \033[34m(', board, ', ', no, ')\033[31m.\033[0m', sep='' )
|
||||
return
|
||||
thread_pool = []
|
||||
for fi in fileinfos:
|
||||
while 1:
|
||||
with db.connection_pool_lock:
|
||||
if len(db.connection_pool) != 0:
|
||||
con = db.connection_pool.pop(0)
|
||||
if con == None:
|
||||
db.connection_produced.wait()
|
||||
else:
|
||||
break
|
||||
thread = multiprocessing.Process(target=archive_file, args=[board, no, fi, con, True])
|
||||
with db.connection_pool_lock:
|
||||
db.connection_pool.append(con)
|
||||
thread.daemon = True
|
||||
thread_pool.append(thread)
|
||||
thread.start()
|
||||
for t in thread_pool:
|
||||
t.join()
|
||||
print('\033[32mRepaired: \033[34m', board, '/', no, '\033[32m.\033[0m', sep='')
|
7  scrapper/service/cron.m4  Normal file
@@ -0,0 +1,7 @@
define(NL, `
')dnl
define(`PWD', translit(esyscmd(`pwd'), NL))dnl
define(realpath, `translit(esyscmd(readlink -f $1), NL)')dnl
define(`ROOT', realpath(PWD`/../../'))dnl
0 * * * * root make -C "ROOT" scrap
30 */3 * * * root make -C "ROOT" repair
5  scrapper/threadpool.py  Normal file
@@ -0,0 +1,5 @@
threadpool = []

def init_threads():
    for i in range(max_threads):

10  scrapper/usage.py  Normal file
@@ -0,0 +1,10 @@
usage_msg = '''\033[1m{0} [options]\033[0m
	-a : scrap all; ignore all filters
	-b <list> : provide a list of boards to archive
	     the default is all that can be found
	     <list> must be a valid python list of strings
	-i : perform an integrity check; specify twice to not carry on with regular scrapping
'''

def print_usage(program_name = 'scrapper'):
    print(usage_msg.format(program_name))
11  srv/archive.apache2.vhost.conf.m4  Normal file
@@ -0,0 +1,11 @@
define(`PWD', esyscmd(`pwd'))
define(`PWD', substr(PWD, 0, eval(len(PWD) - 1)))

include(PWD`/srv/config.m4')

Listen PORT

<VirtualHost *:PORT>
	ServerName DOMAIN
	DocumentRoot "PWD`/front_end/'"
</VirtualHost>
2  srv/config.m4  Normal file
@@ -0,0 +1,2 @@
define(`DOMAIN', `my_archive.org')
define(`PORT', `45872')
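With the values above, `make server` should render roughly this vhost (a sketch: the DocumentRoot path is illustrative, PWD expands to the actual checkout directory):
```apache
Listen 45872

<VirtualHost *:45872>
	ServerName my_archive.org
	DocumentRoot "/path/to/vichan_scrapper/front_end/"
</VirtualHost>
```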