This commit is contained in:
anon
2024-01-25 18:37:18 +01:00
commit ec30c7d15d
52 changed files with 1908 additions and 0 deletions

9
.gitignore vendored Normal file

@ -0,0 +1,9 @@
**/__pycache__/
scrapper/venv
scrapper/service/scrapper.lock
vichan_archive.tar.gz
vichan_archive_data.tar.gz
db/data.sqlite
db/data.sqlite.bak
db/files/*
db/files.bak

52
Makefile Normal file

@ -0,0 +1,52 @@
# Default target: harmless no-op so a bare `make` never destroys anything.
pass:
	@echo pass
# ----
include config.mk
# NOTE(review): APACHE_CONFIG_DIR in config.mk already ends in vhosts.d/,
# so this expands to .../vhosts.d/vhosts.d/ — confirm the intended path.
VHOSTS_D := $(APACHE_CONFIG_DIR)/vhosts.d/
# bash is required: the init_python recipe uses `source`.
SHELL := /bin/bash

# None of these targets produce a file with the target's name; declare them
# phony so a same-named file in the tree can never mask them.
.PHONY: pass clone init_db init_python init server service scrap repair restore

# Build the distributable tarballs (data and source) into front_end/.
clone:
	tar -I 'gzip --best' -c db/data.sqlite db/files/ -f front_end/vichan_archive_data.tar.gz
	git archive --output=front_end/vichan_archive.tar.gz master

# Re-initialize the database, keeping a .bak copy of the old data.
# Leading `-`: ignore errors so a fresh checkout (nothing to back up) works.
init_db:
	-mv db/data.sqlite db/data.sqlite.bak
	-rm -r db/files.bak
	-mv db/files db/files.bak
	-mkdir db/files; touch db/files/.placeholder
	cd db/; cat init.sql | sqlite3

# Create the scrapper's Python virtualenv and install its requirements.
init_python:
	cd scrapper/; \
	python -m venv venv; \
	source venv/bin/activate; \
	pip install -r requirements.txt

init: init_db init_python

# Install the Apache vhost config (uses APACHE_CONFIG_DIR from config.mk).
server:
	-mkdir $(VHOSTS_D)
	m4 srv/archive.apache2.vhost.conf.m4 > $(VHOSTS_D)/archive.conf

# Install the cron entry that schedules the scrapper.
service:
	cd scrapper/service/; \
	m4 cron.m4 > /etc/cron.d/fc_scrapper

# Run the scrapper once, in the foreground.
scrap:
	cd scrapper/; \
	./run.sh

# Run the scrapper in repair mode (re-fetch corrupted posts' files).
repair:
	cd scrapper/; \
	./run.sh -r

# Roll the database back to the last .bak made by init_db.
restore:
	-rm db/data.sqlite
	-cp db/data.sqlite.bak db/data.sqlite

60
README.md Normal file

@ -0,0 +1,60 @@
# Vichan Scrapper
> scrapper for archiving data from vichan instances with a minimalistic frontend included to ease local viewing
### Demo
![frontend\_index\_demo](docs/demo1.png)
![frontend\_board\_demo](docs/demo2.png)
### Disclaimer
The scrapper worked well on the specific instance it was built around and tested on;
however, it may fail on a different (unknown) version
and/or configuration.
### Requirements
+ Python3
+ Sqlite3
+ Apache2 or PHP (see Installation/Server)
### Installation
1. Meet the requirements
#### Base
```sh
$ make init # initialize database and python environment
$ make clone # create tarballs for distribution
```
##### Server
###### For yourself
+ If you don't need the front end to be publicly facing, it's easiest to use the PHP interpreter's built-in server.
1. Navigate to the front end's directory
```sh
$ cd <vichan_scrapper>/front_end/
```
2. Deploy the PHP server
```sh
$ php -S localhost:8000
```
3. Access it through your browser on the address:
localhost:8000
###### In production
1. Run apache
2. Add configs
```sh
$ make server
```
##### Schedule scrapper
```sh
$ make service
```
+ the default is to scrape every hour, and to attempt to fetch missing files every 3.5 hours
+ for personalization see scrapper/service/cron.m4
### Configuration
#### Scrapper
#### Front end
+ front\_end/config.php
- posts\_per\_page : int
- search\_enabled : boolean
#### Recommendations
+ use XFS

1
config.mk Normal file

@ -0,0 +1 @@
# Base directory of the Apache configuration; the Makefile's `server` target
# installs the vhost under $(APACHE_CONFIG_DIR)/vhosts.d/.
# NOTE(review): this value already ends in vhosts.d/, and the Makefile appends
# another vhosts.d/ — confirm whether this should be just /etc/apache2/.
APACHE_CONFIG_DIR := /etc/apache2/vhosts.d/

50
db/init.sql Normal file

@ -0,0 +1,50 @@
.open data.sqlite
-- --------------------------------------------------------
--
-- Table structure for table `boards`
--
-- One row per scraped board; `name` is stored with slashes, e.g. '/b/'
-- (the front end matches it with '/'.$s.'/').
-- NOTE(review): `desc` is an SQL keyword; SQLite tolerates it here, but
-- quoting it ("desc") would be safer and more portable.
DROP TABLE IF EXISTS boards;
CREATE TABLE boards (name VARCHAR(10) PRIMARY KEY,
desc VARCHAR(24)
);
-- --------------------------------------------------------
--
-- Table structure for table `posts`
--
-- Post numbers restart per board, hence the composite (id, board) key.
-- `thread` is NULL for thread openers and holds the opener's id for replies.
-- NOTE(review): `board` is declared INTEGER but the scrapper and front end
-- store/compare text like '/b/' (works via SQLite type affinity); TEXT would
-- match actual usage.
DROP TABLE IF EXISTS posts;
CREATE TABLE posts (
id INTEGER,
board INTEGER,
thread INT(11) DEFAULT NULL,
subject VARCHAR(100) DEFAULT NULL,
email VARCHAR(30) DEFAULT NULL,
name VARCHAR(35) DEFAULT NULL,
trip VARCHAR(15) DEFAULT NULL,
capcode VARCHAR(50) DEFAULT NULL,
body text,
time VARCHAR(30),
num_files INT(11) DEFAULT 0, -- Used for integrity checks, NOT redundant
--`filehash` text CHARACTER SET ascii
PRIMARY KEY (id, board)
);
-- --------------------------------------------------------
--
-- Table structure for table `files`
--
-- One row per downloaded attachment; (post, board) references posts.
DROP TABLE IF EXISTS files;
CREATE TABLE files (
id INTEGER PRIMARY KEY,
name TEXT,
post INTEGER NOT NULL,
board INTEGER NOT NULL,
path text
);

File diff suppressed because one or more lines are too long

BIN
docs/demo1.png Normal file

Binary file not shown.

After

(image error) Size: 2.6 MiB

BIN
docs/demo2.png Normal file

Binary file not shown.

After

(image error) Size: 2.0 MiB

27
front_end/404.php Normal file

@ -0,0 +1,27 @@
<!DOCTYPE html>
<html>
<head>
<title>Example Archive - 404</title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<style>
#_404_container {
display: flex;
justify-content: center;
}
#_404_container img {
width: 60%;
}
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Vichan Archive - 404</p>
</div>
<div id=_404_container>
<img id="404" src="media/apu404.png" alt="apu404"></img>
</div>
</div>
</body>
</html>

Binary file not shown.

After

(image error) Size: 24 KiB

Binary file not shown.

After

(image error) Size: 135 KiB

Binary file not shown.

After

(image error) Size: 22 KiB

110
front_end/board.php Normal file

@ -0,0 +1,110 @@
<!DOCTYPE html>
<html>
<?php
require_once('global.php');
require_once('config.php');
# Query validation
if(!isset($_GET['page'])){
$page = 1;
}else{
$page = intval($_GET['page']);
}
if(validate_board_name($_GET['board'])){
$board = $_GET['board'];
}else{
header('Location: /404.php');
die();
}
?>
<head>
<title>Examplechan - Archive /<?=$board?>/</title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<style>
span {
color: #06df20;
}
.thread {
box-sizing: border-box;
padding: 20px;
}
.thread:hover {
background: teal;
cursor: pointer;
}
.thread img {
width: 200px;
}
.file {
max-height: 400px;
overflow-y: hidden;
}
.page_list {
text-align: center;
font-size: 2rem;
color: yellow;
font-weight: bold;
}
.page_list a:link {
color: lime;
font-weight: normal;
}
.page_list a:visited {
color: lightgreen;
}
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Examplechan Archive - /<?=$board?>/</p>
<a href="/">
<img id=plant src="media/plant.png" alt="fc_logo"></img>
</a>
</div>
<div class=page_list>
[
<?php
// Use the validated $board (checked against the boards table above), never
// raw $_GET['board'] — the old code concatenated the raw query parameter
// into the SQL string, which is an SQL injection vector.
$post_count = $db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'/' . $board . '/\' and thread is NULL;');
$page_count = ceil($post_count / $config['posts_per_page']);
for($i = 0; $i < $page_count; $i++):
?>
<a href="/board.php?board=<?=$board?>&page=<?=$i+1?>"><?=$i+1?></a>
<?php
endfor;
?>
]
</div>
<hr>
<hr>
<?php
$query = 'SELECT * FROM posts WHERE ' .
'board = \'/' . $board . '/\' ' .
'AND ' .
'thread IS NULL ' .
'ORDER BY id DESC ' .
'LIMIT ' . $config['posts_per_page'] . ' ' .
'OFFSET ' . ($config['posts_per_page']*($page-1)) . ';';
$results = $db->query($query);
while($row = $results->fetchArray()):
?>
<div class="thread" onclick="window.location='/post.php?board=<?=$board?>&post=<?=$row['id']?>';">
<div>
<?=print_post_head($row)?>
<div class='files'>
<?=print_files($row['id'], $row['board'])?>
</div>
<div class='post_body'>
<?=$row['body']?>
</div>
</div>
</div>
<hr>
<?php endwhile; ?>
<script id=page_list_duplicator type="text/javascript" src="js/duplicate_page_list.js"></script>
</div>
</body>
</html>

31
front_end/color_hash.php Normal file

@ -0,0 +1,31 @@
<?php
define('COLORS', [
["#ff0000", "white"], /* Red */
["#ffa500", "black"], /* Orange */
["#ffff00", "black"], /* Yellow */
["#00ff00", "black"], /* Lime */
["#008000", "white"], /* Green */
["#00ffff", "black"], /* Aquamarine */
["#00bfff", "white"], /* Cyan */
["#0000ff", "white"], /* Blue */
["#4b0082", "white"], /* Indigo */
["#ffc0cb", "black"], /* Pink */
["#ff00ff", "black"], /* Magenta */
["#ff7f50", "black"], /* Coral */
["#fa8072", "white"], /* Salmon */
["#ff6347", "white"], /* Tomato */
["#ffd700", "black"], /* Gold */
["#f0e68c", "black"], /* Khaki */
["#d2b48c", "white"], /* Tan */
["#d2691e", "white"], /* Chocolate */
["#a0522d", "white"], /* Sienna */
["#800000", "white"], /* Maroon */
["#808080", "white"], /* Gray */
["#000000", "white"], /* Black */
["#ffffff", "black"] /* White */
]);
function ids2color($id){
return $id == 'ONION' ? ["#800080", "white"] /* Purple */ : COLORS[intval(crc32($id)) % 23];
}
?>

4
front_end/config.php Normal file

@ -0,0 +1,4 @@
<?php
$config['posts_per_page'] = 10;
$config['search_enabled'] = false;
?>

1
front_end/data.sqlite Symbolic link

@ -0,0 +1 @@
../scrapper/data.sqlite

65
front_end/downloads.php Normal file

@ -0,0 +1,65 @@
<!DOCTYPE html>
<html>
<?php
require_once('config.php');
?>
<head>
<title>ViChan - Archive</title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<style>
button {
float: left;
height: 100%;
width: 100%;
color: green;
font-weight: bold;
font-size: 2.4rem;
}
a {
display: inline-block;
height: 100%;
width: 100%;
}
.bdiv {
height: 100px;
width: 300px;
}
#mid {
display: flex;
justify-content: space-evenly;
padding-top: 40px;
}
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Vichan Archive - Memetic core</p>
</div>
<hr>
<hr>
<div id=mid>
<div class=bdiv>
<a href="vichan_archive_data.tar.gz" download>
<button class=flashy_button>
Database + Files
</button>
</a>
</div>
<div class=bdiv>
<a href="vichan_archive.tar.gz" download>
<button class=flashy_button>
Scrapper + Front end
</button>
</a>
</div>
</div>
</div>
<script>
</script>
</body>
</html>

BIN
front_end/favicon-16x16.png Normal file

Binary file not shown.

After

(image error) Size: 566 B

BIN
front_end/favicon-32x32.png Normal file

Binary file not shown.

After

(image error) Size: 1.4 KiB

BIN
front_end/favicon.ico Normal file

Binary file not shown.

After

Width: 48px  |  Height: 48px  |  Size: 15 KiB

1
front_end/files Symbolic link

@ -0,0 +1 @@
../db/files

74
front_end/global.css Normal file

@ -0,0 +1,74 @@
:root {
--std-border: solid green 5px;
}
body {
margin: 0;
background-image: url("media/background.jpg");
background-repeat: no-repeat;
background-size: cover;
background-attachment: fixed;
color: teal;
}
button {
cursor: pointer;
}
hr {
color: rgba(0, 204, 0, 1);
}
#body_main {
margin: auto;
margin-top: 70px;
width: 70%;
border: var(--std-border);
padding: 10px 30px 50px 30px;
background: rgba(0, 0, 0, 0.5);
}
#index_header {
text-align: center;
}
#index_header p {
font-size: 2rem;
color: lime;
}
.file {
vertical-align: top;
display: inline-block;
}
.post_head {
color: lime;
}
.post_body {
color: white;
display: inline-block;
vertical-align: top;
margin-top: 10px;
}
.subject {
font-weight: bold;
color: #39ff14;
}
.name {
color: #39ff14;
}
.poster_id {
font-weight: bold;
border: 3px;
border-radius: 5px;
padding: 2px 4px 1px 4px;
}
.quote {
color: #55d02e;
}
#plant {
height: 256px;
}
.flashy_button {
background: rgba(45, 226, 230, 1);
border: rgba(45, 226, 230, 1);
border-radius: 10px;
}

60
front_end/global.php Normal file

@ -0,0 +1,60 @@
<?php
require_once('color_hash.php');
$db = new SQLite3('data.sqlite', SQLITE3_OPEN_READONLY);
function print_post_head($p){
    // Render a post's header line: subject, poster name, post number,
    // colored poster-ID badge (color derived from the capcode via
    // ids2color) and timestamp. $p is a row from the `posts` table.
    // NOTE(review): values are echoed without htmlspecialchars() — this
    // assumes the scrapper stored sanitized HTML; confirm before exposing
    // the archive publicly.
    $c = ids2color($p['capcode']);
?>
    <div class="post_head">
        <span class='subject'><?=$p['subject']?></span>
        <span class='name'><?=$p['name']?></span>
        <a class='post_no'>No. <?=$p['id']?></a>
        <span class='poster_id' style="background: <?=$c[0]?>; color: <?=$c[1]?>">ID <?=strtoupper($p['capcode'])?></span>
        <span class='date'><?=$p['time']?></span>
    </div>
<?php
}
function print_file($f){
    // Render one attachment ($f = row from `files`): the file name, then an
    // <img> or <video> element chosen by the MIME type sniffed from the file
    // on disk; any other MIME type gets the name only.
    $mt = mime_content_type($f['path']);
?>
    <div class='file'>
        <div><?=$f['name']?></div>
<?php
    if(strpos($mt, 'image/') === 0):
?>
        <img src='<?=$f['path']?>'></img>
<?php
    elseif(strpos($mt, 'video/') === 0):
?>
        <video src='<?=$f['path']?>'></video>
<?php
    endif;
?>
    </div>
<?php
}
function print_files($no, $board){
    // Print every file attached to post $no on board $board, ordered by id.
    // Uses a prepared statement with bound values instead of string-built
    // SQL: $board/$no come from DB rows of scraped content, so quotes or
    // crafted values in them must not be able to alter the query.
    global $db;
    $stmt = $db->prepare('SELECT * FROM files WHERE post = :post AND board = :board ORDER BY id;');
    $stmt->bindValue(':post', intval($no), SQLITE3_INTEGER);
    $stmt->bindValue(':board', $board, SQLITE3_TEXT);
    $files = $stmt->execute();
    if($files){
        while($f = $files->fetchArray()){
            print_file($f);
        }
    }
}
function validate_board_name($s){
    // A board name is valid iff '/<name>/' appears in the boards table.
    global $db;
    $needle = '/' . $s . '/';
    $result = $db->query('SELECT name FROM boards;');
    while($row = $result->fetchArray()){
        if($row['name'] == $needle){
            return true;
        }
    }
    return false;
}
?>

98
front_end/index.php Normal file

@ -0,0 +1,98 @@
<!DOCTYPE html>
<html>
<?php
require_once('global.php');
?>
<head>
<title>ExampleChan - Archive</title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<style>
#menu {
display: flex;
justify-content: space-between;
gap: 0.5%;
}
#menu * {
width: 100%;
height: 30px;
box-sizing: border-box
}
/* ------- */
table {
color: white;
border: var(--std-border);
width: 100%;
border-collapse: collapse;
}
.hr {
background: rgba(0, 128, 128, 0.5);
}
.hr:hover {
background: forestgreen;
cursor: pointer;
/*font-weight: bold;*/
}
th, td {
text-align: left;
}
th {
color: lime;
border: solid green 1px;
border-collapse: collapse;
}
td {
padding-left: 1%;
}
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Examplechan Archive</p>
<img id="plant" src="media/plant.png" alt="fc_logo"></img>
</div>
<div id=menu>
<a href="/downloads.php">
<button class=flashy_button>
Get a copy
</button>
</a>
<a href="/search.php">
<button class=flashy_button>
Advanced search
</button>
</a>
</div>
<hr>
<table id=board_list>
<thead>
<tr class="hr">
<th>Board</th>
<th>Threads</th>
<th>Files</th>
<th>Posts</th>
</tr>
</thead>
<tbody>
<?php
$results = $db->query('SELECT * FROM boards;');
while($row = $results->fetchArray()):
?>
<tr class="hr" onclick="window.location='<?='/board.php?board='.trim($row['name'], '/')?>';">
<td><?=$row['name']?> - <?=$row['desc']?></td>
<td><?=$db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'' . $row['name'] . '\' AND thread IS NULL;')?></td>
<td><?=$db->querySingle('SELECT COUNT(*) FROM posts INNER JOIN files ON posts.id = files.post and posts.board = files.board WHERE posts.board = \'' . $row['name'] . '\';')?></td>
<td><?=$db->querySingle('SELECT COUNT(*) count FROM posts WHERE board = \'' . $row['name'] . '\';')?></td>
</tr>
<?php endwhile; ?>
</tbody>
</table>
</div>
<script>
</script>
</body>
</html>

@ -0,0 +1,2 @@
var pl2 = document.getElementsByClassName('page_list')[0].cloneNode(true);
document.getElementById('page_list_duplicator').replaceWith(pl2);

5
front_end/js/jquery.min.js vendored Normal file

File diff suppressed because one or more lines are too long

170
front_end/js/post-hover.js Normal file

@ -0,0 +1,170 @@
/*
* post-hover.js
* https://github.com/savetheinternet/Tinyboard/blob/master/js/post-hover.js
*
* Released under the MIT license
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
* Copyright (c) 2013-2014 Marcin Łabanowski <marcin@6irc.net>
* Copyright (c) 2013 Macil Tech <maciltech@gmail.com>
*
* Usage:
* $config['additional_javascript'][] = 'js/jquery.min.js';
* $config['additional_javascript'][] = 'js/post-hover.js';
*
*/
onready(function(){
var dont_fetch_again = [];
init_hover = function() {
var $link = $(this);
var id;
var matches;
if ($link.is('[data-thread]')) {
id = $link.attr('data-thread');
}
else if(matches = $link.text().match(/^>>(?:>\/([^\/]+)\/)?(\d+)$/)) {
id = matches[2];
}
else {
return;
}
var board = $(this);
while (board.data('board') === undefined) {
board = board.parent();
}
var threadid;
if ($link.is('[data-thread]')) threadid = 0;
else threadid = board.attr('id').replace("thread_", "");
board = board.data('board');
var parentboard = board;
if ($link.is('[data-thread]')) parentboard = $('form[name="post"] input[name="board"]').val();
else if (matches[1] !== undefined) board = matches[1];
var $post = false;
var hovering = false;
var hovered_at;
$link.hover(function(e) {
hovering = true;
hovered_at = {'x': e.pageX, 'y': e.pageY};
var start_hover = function($link) {
if ($post.is(':visible') &&
$post.offset().top >= $(window).scrollTop() &&
$post.offset().top + $post.height() <= $(window).scrollTop() + $(window).height()) {
// post is in view
$post.addClass('highlighted');
} else {
var $newPost = $post.clone();
$newPost.find('>.reply, >br').remove();
$newPost.find('span.mentioned').remove();
$newPost.find('a.post_anchor').remove();
$newPost
.attr('id', 'post-hover-' + id)
.attr('data-board', board)
.addClass('post-hover')
.css('border-style', 'solid')
.css('box-shadow', '1px 1px 1px #999')
.css('display', 'block')
.css('position', 'absolute')
.css('font-style', 'normal')
.css('z-index', '100')
.addClass('reply').addClass('post')
.insertAfter($link.parent())
$link.trigger('mousemove');
}
};
$post = $('[data-board="' + board + '"] div.post#reply_' + id + ', [data-board="' + board + '"]div#thread_' + id);
if($post.length > 0) {
start_hover($(this));
} else {
var url = $link.attr('href').replace(/#.*$/, '');
if($.inArray(url, dont_fetch_again) != -1) {
return;
}
dont_fetch_again.push(url);
$.ajax({
url: url,
context: document.body,
success: function(data) {
var mythreadid = $(data).find('div[id^="thread_"]').attr('id').replace("thread_", "");
if (mythreadid == threadid && parentboard == board) {
$(data).find('div.post.reply').each(function() {
if($('[data-board="' + board + '"] #' + $(this).attr('id')).length == 0) {
$('[data-board="' + board + '"]#thread_' + threadid + " .post.reply:first").before($(this).hide().addClass('hidden'));
}
});
}
else if ($('[data-board="' + board + '"]#thread_'+mythreadid).length > 0) {
$(data).find('div.post.reply').each(function() {
if($('[data-board="' + board + '"] #' + $(this).attr('id')).length == 0) {
$('[data-board="' + board + '"]#thread_' + mythreadid + " .post.reply:first").before($(this).hide().addClass('hidden'));
}
});
}
else {
$(data).find('div[id^="thread_"]').hide().attr('data-cached', 'yes').prependTo('form[name="postcontrols"]');
}
$post = $('[data-board="' + board + '"] div.post#reply_' + id + ', [data-board="' + board + '"]div#thread_' + id);
if(hovering && $post.length > 0) {
start_hover($link);
}
}
});
}
}, function() {
hovering = false;
if(!$post)
return;
$post.removeClass('highlighted');
if($post.hasClass('hidden') || $post.data('cached') == 'yes')
$post.css('display', 'none');
$('.post-hover').remove();
}).mousemove(function(e) {
if(!$post)
return;
var $hover = $('#post-hover-' + id + '[data-board="' + board + '"]');
if($hover.length == 0)
return;
var scrollTop = $(window).scrollTop();
if ($link.is("[data-thread]")) scrollTop = 0;
var epy = e.pageY;
if ($link.is("[data-thread]")) epy -= $(window).scrollTop();
var top = (epy ? epy : hovered_at['y']) - 10;
if(epy < scrollTop + 15) {
top = scrollTop;
} else if(epy > scrollTop + $(window).height() - $hover.height() - 15) {
top = scrollTop + $(window).height() - $hover.height() - 15;
}
$hover.css('left', (e.pageX ? e.pageX : hovered_at['x'])).css('top', top);
});
};
$('div.body a:not([rel="nofollow"])').each(init_hover);
// allow to work with auto-reload.js, etc.
$(document).on('new_post', function(e, post) {
$(post).find('div.body a:not([rel="nofollow"])').each(init_hover);
});
});

@ -0,0 +1,61 @@
/*
* show-backlinks.js
* https://github.com/savetheinternet/Tinyboard/blob/master/js/show-backlinks.js
*
* Released under the MIT license
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
* Copyright (c) 2013-2014 Marcin Łabanowski <marcin@6irc.net>
*
* Usage:
* $config['additional_javascript'][] = 'js/jquery.min.js';
* // $config['additional_javascript'][] = 'js/post-hover'; (optional; must come first)
* $config['additional_javascript'][] = 'js/show-backlinks.js';
*
*/
$(document).ready(function(){
var showBackLinks = function() {
var reply_id = $(this).attr('id').replace(/(^reply_)|(^op_)/, '');
$(this).find('div.body a:not([rel="nofollow"])').each(function() {
var id, post, $mentioned;
if(id = $(this).text().match(/^>>(\d+)$/))
id = id[1];
else
return;
$post = $('#reply_' + id);
if($post.length == 0){
$post = $('#op_' + id);
if($post.length == 0)
return;
}
$mentioned = $post.find('p.intro span.mentioned');
if($mentioned.length == 0)
$mentioned = $('<span class="mentioned unimportant"></span>').appendTo($post.find('p.intro'));
if ($mentioned.find('a.mentioned-' + reply_id).length != 0)
return;
var $link = $('<a class="mentioned-' + reply_id + '" onclick="highlightReply(\'' + reply_id + '\');" href="#' + reply_id + '">&gt;&gt;' +
reply_id + '</a>');
$link.appendTo($mentioned)
if (window.init_hover) {
$link.each(init_hover);
}
});
};
$('div.post.reply').each(showBackLinks);
$('div.post.op').each(showBackLinks);
$(document).on('new_post', function(e, post) {
showBackLinks.call(post);
if ($(post).hasClass("op")) {
$(post).find('div.post.reply').each(showBackLinks);
}
});
});

40
front_end/js/show-op.js Normal file

@ -0,0 +1,40 @@
/*
* show-op
* https://github.com/savetheinternet/Tinyboard/blob/master/js/show-op.js
*
* Adds "(OP)" to >>X links when the OP is quoted.
*
* Released under the MIT license
* Copyright (c) 2012 Michael Save <savetheinternet@tinyboard.org>
* Copyright (c) 2014 Marcin Łabanowski <marcin@6irc.net>
*
* Usage:
* $config['additional_javascript'][] = 'js/jquery.min.js';
* $config['additional_javascript'][] = 'js/show-op.js';
*
*/
$(document).ready(function(){
let OP = parseInt($('.op .post_no').text().replace(/^\D+/g, ""))
if(isNaN(OP)){ return; }
var showOPLinks = function() {
$(this).find('div.post_body a:not([rel="nofollow"])').each(function() {
var postID;
if(postID = $(this).text().match(/^>>(\d+)$/))
postID = postID[1];
else
return;
if (postID == OP) {
$(this).after(' <small>(OP)</small>');
}
});
};
$('div.post.reply').each(showOPLinks);
});

BIN
front_end/media/apu404.png Normal file

Binary file not shown.

After

(image error) Size: 78 KiB

Binary file not shown.

After

(image error) Size: 354 KiB

Binary file not shown.

After

(image error) Size: 500 KiB

BIN
front_end/media/plant.png Normal file

Binary file not shown.

After

(image error) Size: 285 KiB

11
front_end/playground.php Normal file

@ -0,0 +1,11 @@
<?php
# Developer scratch file: demonstrates that the `board` query parameter is
# concatenated into SQL unescaped (note the injected `create table` appended
# to the query below).
# NOTE(review): SQLite3::querySingle only executes the first statement, but
# this file must not ship in production installs — consider deleting it.
require_once('global.php');
require_once('config.php');
$query = 'SELECT COUNT(*) count FROM posts
WHERE
board = \'/' . $_GET['board'] . '/\'
AND
thread is NULL; create table fuck_you (i int);';
echo $query . '</br>';
echo $db->querySingle($query) . '</br>';
?>

72
front_end/post.php Normal file

@ -0,0 +1,72 @@
<!DOCTYPE html>
<html>
<?php
require_once('global.php');
# Query validation
if(validate_board_name($_GET['board'])){
$board = '/'.$_GET['board'].'/';
}else{
header('Location: /404.php');
die();
}
?>
<head>
<title>Examplechan - Archive /<?=$board?>/<?=$_GET['post']?></title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<script type="text/javascript" src="js/jquery.min.js"></script>
<script type="text/javascript" src="js/show-op.js"></script>
<style>
#body_main {
margin-bottom: 200px;
}
.post img {
width: 100%;
}
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Examplechan Archive - /<?=$board?>/</p>
<p>Thread No. <?=$_GET['post']?><p>
<a href="/">
<img id=plant src="media/plant.png" alt="fc_logo"></img>
</a>
</div>
<hr>
<hr>
<!-- ###### -->
<div class="op post">
<?php
// Cast the post number to int: $_GET['post'] was previously concatenated
// into the SQL string unchecked (SQL injection). $board is already
// validated against the boards table at the top of this file.
$query = 'SELECT * FROM posts WHERE id = ' . intval($_GET['post']) . ' AND board = \'' . $board . '\';';
$thread = $db->query($query)->fetchArray();
?>
<div class='files'>
<?=print_files($thread['id'], $thread['board'])?>
</div>
<?=print_post_head($thread)?>
<div class='post_body'>
<?=$thread['body']?>
</div>
</div>
<!-- ###### -->
<?php
$posts = $db->query('SELECT * FROM posts WHERE thread = ' . $thread['id'] . ' AND board = \'' . $board . '\';');
while($p = $posts->fetchArray()):
?>
<hr>
<div class="reply post">
<?=print_post_head($p)?>
<div class='files'>
<?=print_files($p['id'], $p['board'])?>
</div>
<div class='post_body'>
<?=$p['body']?>
</div>
</div>
<?php endwhile; ?>
</div>
</body>
</html>

34
front_end/search.php Normal file

@ -0,0 +1,34 @@
<!DOCTYPE html>
<html>
<?php
require_once('config.php');
?>
<head>
<title>Examplechan - Archive</title>
<link rel="stylesheet" href="global.css">
<meta charset="utf-8">
<style>
</style>
</head>
<body>
<div id=body_main>
<div id=index_header>
<p>Examplechan Archive - Advanced Search</p>
<div>
<?php
// Advanced search is off by default; enable via front_end/config.php.
// Fixes typos in the user-facing message ("securitiy", "It recommended").
if(!$config['search_enabled']){
    echo "<h3>Advanced search was disabled on this instance due to security reasons. It is recommended you get a local copy and search that way.</h3>";
    die();
}
?>
</div>
</div>
<div id=search_box>
</div>
<div id=result_box>
</div>
</div>
<script>
</script>
</body>
</html>

@ -0,0 +1 @@
{"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"}

17
scrapper/antiRange.py Normal file

@ -0,0 +1,17 @@
# Constantly querying whether we already have a post is expensive,
# and so is storing every post number in memory.
# Since the posts we have are mostly continuous (in production at least),
# we store only the border values and the missing values in their range.
class AntiRange:
    """Compact membership structure: [min_, max_] minus the gaps in not_.

    not_ is kept SORTED: db.is_post_archieved runs bisect over it, and bisect
    requires sorted input. The original built it with list(set - set), whose
    order is arbitrary, silently breaking those lookups.
    """
    def __init__(self, range_ : list):
        # Empty input: degenerate range that contains nothing
        # (min_ > max_, so every membership test fails fast).
        if range_ == []:
            import sys
            self.min_ = sys.maxsize
            self.max_ = 0
            self.not_ = []
            return
        self.min_ = min(range_)
        self.max_ = max(range_)
        # Values missing from [min_, max_), sorted ascending for bisect.
        self.not_ = sorted(set(range(self.min_, self.max_)) - set(range_))
anti_ranges = {}

37
scrapper/config.py Normal file

@ -0,0 +1,37 @@
from bs4 import BeautifulSoup
# Predicate deciding whether a thread should be archived.
# Return value:
#   True  - archive the thread
#   False - skip it
# All threads are filtered through this function unless '-a' is specified.
def is_thread_allegeable(p : BeautifulSoup):
    return True
# Range of pages to scrape. Ignored if '-a' is specified.
min_page = 1
max_page = 10000 # overshooting this value does not cause overhead
if min_page > max_page:
    raise Exception('Invalid page range [{0};{1}].'.format(min_page, max_page))
# List of boards to archive.
# Overridden by '-b'.
# Empty means 'all'.
boards = []
# Seconds to wait before giving up on each request.
request_time_out = 5
# Domain to scrape from.
base_url = 'https://examplechan.org'
# URL marking the 404 page.
# Dubiously, it does not return a 404 response code, therefore the URL
# itself must be compared against.
_404_url = base_url + '/404.html'
# Maximum number of worker threads to create.
# Should be 2-4 times the number of available CPU cores.
# To determine the perfect value, experimenting is recommended.
# Go with <cores>*2 if you're clueless.
max_threads = 4

1
scrapper/data.sqlite Symbolic link

@ -0,0 +1 @@
../db/data.sqlite

183
scrapper/db.py Normal file

@ -0,0 +1,183 @@
import sqlite3
import bisect
import multiprocessing
import random
import time
#
import config
from antiRange import AntiRange, anti_ranges
# --- Tricks i could still implement to make this faster ---
# > connection pool
# > pragma journal_mode = WAL;
# > pragma synchronous = normal;
CONNECT_TO = "data.sqlite"
connection_pool = []
connection_pool_lock = multiprocessing.Lock()
connection_produced = multiprocessing.Event()
def connections_init():
for i in range(config.max_threads):
connection = sqlite3.Connection(CONNECT_TO, check_same_thread=False)
connection_pool.append(connection)
class Board:
def __init__(self, n, d):
self.name = n
self.description = d
class Post:
def __init__(self, no, poster, date, text,
poster_id = None,
num_files = 0,
subject = None,
board = None,
thread = None
):
if board == None and thread == None:
raise Exception('Orphan post')
self.no = no
self.poster = poster
self.date = date
self.poster_id = poster_id
self.num_files = num_files
self.subject = subject
self.text = text
self.board = board
self.thread = thread
class File:
def __init__(self, name, post, board, path):
self.name = name
self.post = post
self.board = board
self.path = path
def corrupt_posts():
with sqlite3.Connection(CONNECT_TO) as con:
cursor = con.cursor()
cursor.execute(
'SELECT posts.board, posts.id, posts.thread, file_count.count, posts.num_files \
FROM \
posts \
INNER JOIN \
(SELECT post, board, count(*) AS count \
FROM \
files \
GROUP BY post) \
file_count ON \
posts.id = file_count.post \
AND \
posts.board = file_count.board \
WHERE \
(file_count.count is null and posts.num_files != 0) \
OR \
file_count.count < posts.num_files \
;'
)
return cursor.fetchall()
def is_post_archieved(board: str, no : int):
    # Membership test against the board's AntiRange: a post is archived iff
    # it lies inside [min_, max_] and is not one of the recorded gaps.
    ar = anti_ranges[board]
    if no > ar.max_ or no < ar.min_:
        return False
    # NOTE(review): bisect assumes ar.not_ is sorted; AntiRange builds it
    # from a set difference, so confirm the ordering before trusting this.
    pos = bisect.bisect_left(ar.not_, no)
    if pos < len(ar.not_) and ar.not_[pos] == no:
        return False
    return True
def insert_file(f : File, con : sqlite3.Connection):
query = "INSERT INTO files \
(name, post, board, path) \
VALUES \
('{0}', '{1}', '{2}', '{3}');".format(
f.name.replace("'", "''"),
f.post,
f.board,
f.path
)
while 1:
try:
con.execute(query)
con.commit()
print('\t\033[32mArchived file \033[34m\"{0}\"\033[32m.\033[0m'.format(f.name))
break
except sqlite3.OperationalError:
print('fuck, race condition', multiprocessing.current_process().pid)
time.sleep(random.uniform(0.1, 1.0))
def insert_post(p : Post, board : str):
if p.thread == None:
var_col = 'subject'
var_val = p.subject.replace("'", "''")
else:
var_col = 'thread'
var_val = p.thread
query = "INSERT INTO posts \
( \
id, \
board, \
name, \
capcode, \
time, \
body, \
num_files, \
{var_col} \
) \
VALUES \
( \
'{id}', \
'{board}', \
'{name}', \
'{capcode}', \
'{date}', \
'{body}', \
{num_files}, \
'{var_val}' \
);".format(
id = p.no,
board = board,
name = p.poster.replace("'", "''"),
capcode = p.poster_id,
date = p.date,
body = p.text.replace("'", "''"),
num_files = p.num_files,
#
var_col = var_col,
var_val = var_val
)
try:
with sqlite3.Connection(CONNECT_TO) as con:
con.execute(query)
msg = ''.join(['\t\033[32mArchived post no. \033[34m', p.no, '\033[32m'])
if p.thread != None:
msg = ''.join([msg, ' (belonging to thread: ', '\033[34m', p.thread, '\033[32m)'])
msg = ''.join([msg, '.\033[0m'])
print(msg)
except sqlite3.IntegrityError:
pass
def board2antirange(board : str):
with sqlite3.Connection(CONNECT_TO) as con:
query = "SELECT id FROM posts WHERE board = '{0}';".format(board)
r = con.execute(query)
return AntiRange([x[0] for x in r.fetchall()])
def insert_board(b : Board):
    """Insert a board row (name, desc), silently skipping duplicates.

    Uses a parameterized query instead of str.format: board names and
    descriptions are scraped from the site, and the old string-built SQL
    broke on (or could be injected through) quotes in either value.
    """
    try:
        with sqlite3.Connection(CONNECT_TO) as con:
            con.execute("INSERT INTO boards (name, desc) VALUES (?, ?);",
                        (b.name, b.description))
    except sqlite3.IntegrityError:
        # Board already recorded on a previous run — nothing to do.
        pass

1
scrapper/files Symbolic link

@ -0,0 +1 @@
../db/files

89
scrapper/main.py Executable file

@ -0,0 +1,89 @@
#!/bin/python3
import os
import sys
import fcntl
import signal
import multiprocessing
from bs4 import BeautifulSoup
#
from antiRange import AntiRange, anti_ranges
import scrap
import db
import opts
import config
#talom = {}
lockf = None
def handler(signum, frame):
print('\033[31mReceived SIGINT, exiting...\033[0m')
exit(1)
def main(argv):
signal.signal(signal.SIGINT, handler)
# ---
opts.opts(argv)
# ---
db.connections_init()
# ---
if opts.is_service:
lockpath = 'service/scrapper.lock'
lockf = open(lockpath, 'r+')
while 1:
try:
fcntl.flock(lockf, fcntl.LOCK_EX | fcntl.LOCK_NB)
break
except OSError:
if opts.restart_service:
prev_inst_pid = int(lockf.read())
os.kill(prev_inst_pid, signal.SIGINT)
print('\033[31mPrevious instance (\033[34m', prev_inst_pid, '\033[31m) killed.\033[0m', sep='')
import time
time.sleep(1)
else:
print('\033[31mAnother instance is blocking execution. Quiting...\033[0m')
signal.raise_signal(signal.SIGINT)
# NOT REACHED
pid = os.getpid()
lockf.seek(0, 0)
lockf.truncate()
lockf.write(str(pid))
lockf.flush()
# ---
if opts.integrity_check:
corrupted = db.corrupt_posts()
print('\033[31mFound the following threads to be corrupted: \033[34m', str(corrupted), '\033[31m.\033[0m', sep='')
for c in corrupted:
board = c[0]
no = str(c[1])
op = str(c[2])
got = 0 if c[3] == None else str(c[3])
expected = c[4]
print('\033[33mRepairing: \033[34m', board, no, ' (', got, '/', expected, ')\033[33m.\033[0m', sep='')
scrap.repair_corrupted(board, op, no)
if opts.only_integrity_check:
return 0
# ---
if config.boards == []:
print('\033[33mScrapping board names... \033[0m', end='')
boards = scrap.get_boards_from_site()
if boards == None:
signal.raise_signal(signal.SIGINT)
print('\033[32mDone. Got:\033[0m', '\033[34m{0}\033[0m'.format(str([b.name for b in boards])))
else:
boards = config.boards
# ---
for b in boards:
print('\033[33mArchiving board: \033[34m\'{0}\'\033[0m'.format(b.name))
db.insert_board(b)
anti_ranges[b.name] = db.board2antirange(b.name)
scrap.archive_board(b.name)
print('\033[32mArchived board: \033[34m\'{0}\'\033[0m'.format(b.name))
# ---
print('\033[32mFinished.')
if __name__ != '__main__':
exit(1)
main(sys.argv)

41
scrapper/opts.py Normal file

@ -0,0 +1,41 @@
import getopt
#
import config
import usage
# Flag globals set by opts() below and read by main.py / scrap.py.
archive_all = False            # -a: ignore filters, widen the page window
integrity_check = False        # -i: run the integrity check before scrapping
only_integrity_check = False   # -i -i: quit after the integrity check
is_service = False             # -s: run as the cron service (lock file)
restart_service = False        # -s -s: kill a running instance and take over
def opts(argv : list):
    """Parse the command line into the module-level flag globals.

    argv is the full argv vector (argv[0] is the program name).
    An unknown option prints usage and exit(1); -h prints usage and exit(0).
    """
    global archive_all, integrity_check, only_integrity_check, is_service, restart_service
    try:
        opts = getopt.getopt(args = argv[1:], shortopts = 'ab:ish')[0]
        for o in opts:
            if o[0] == '-a':
                # Scrap everything: widen the page window, disable filters.
                archive_all = True
                config.min_page = 1
                config.max_page = 10000
            elif o[0] == '-b':
                # The argument must be a Python list literal. Parse it with
                # ast.literal_eval instead of exec() so arbitrary code given
                # on the command line is never executed.
                # NOTE(review): main() reads b.name on each entry -- a plain
                # list of strings may not be what the rest expects; verify.
                import ast
                config.boards = ast.literal_eval(o[1])
            elif o[0] == '-i':
                # First -i: run the integrity check; second -i: run only it.
                if not integrity_check:
                    integrity_check = True
                else:
                    only_integrity_check = True
            elif o[0] == '-s':
                # First -s: behave as the cron service (lock file protected);
                # second -s: kill and replace an already running instance.
                if not is_service:
                    is_service = True
                else:
                    restart_service = True
            elif o[0] == '-h':
                usage.print_usage(argv[0])
                exit(0)
            else:
                raise getopt.GetoptError(msg = '', opt = o[0])
    except getopt.GetoptError as e:
        print("\033[31mUnrecognized command line option '{0}'.\033[0m".format(e.opt))
        usage.print_usage(argv[0])
        exit(1)

@ -0,0 +1,2 @@
requests
bs4

14
scrapper/run.sh Executable file

@ -0,0 +1,14 @@
#!/bin/bash
# Entry point for one scrapper run; invoked by cron (see service/cron.m4)
# or manually.  Pass -r to run only the integrity-check / repair pass.
set -e
source venv/bin/activate
# Shield this process from the OOM killer.  oom_adj is deprecated (removed
# on modern kernels) and both knobs need root to lower the score, so these
# writes are best-effort: they must not abort the run under `set -e`.
{ echo -16 > /proc/$$/oom_adj; } 2>/dev/null || true
{ echo -1000 > /proc/$$/oom_score_adj; } 2>/dev/null || true
if [ "$1" == '-r' ]; then
	# -s -s: replace a running service instance; -i -i: repair pass only
	python main.py -s -s -i -i
else
	python main.py -s -s
fi

239
scrapper/scrap.py Normal file

@ -0,0 +1,239 @@
import os
import multiprocessing
import hashlib
import sqlite3
import requests as req
from bs4 import BeautifulSoup
#
from antiRange import AntiRange, anti_ranges
import db
import config
import opts
def try_get(url : str):
    """GET `url` with the configured timeout; return None on network failure."""
    try:
        response = req.get(url, timeout = config.request_time_out)
    except (req.exceptions.ConnectionError, req.exceptions.Timeout) as e:
        print('\033[31mConnection error on {0}\033[0m'.format(url), vars(e))
        return None
    return response
def print_status_got(page : int, status : int):
    """Report the HTTP status of a scraped page (green for 200, else yellow)."""
    status_color = '\033[32m' if status == 200 else '\033[33m'
    line = '\033[32mOn page {page}, got {color}\'{status}\'\033[32m.\033[0m'.format(
        page = page,
        color = status_color,
        status = status,
    )
    print(line)
def get_threads_from_page(url : str):
    # Fetch one board index page and return (response, thread_elements).
    # On network failure returns a bare None -- callers tuple-unpack the
    # result and deliberately catch the TypeError (see archive_board).
    response = try_get(url)
    if response == None:
        return
    threads = BeautifulSoup(
            response.text,
            'html.parser'
        ) \
        .find_all(class_='thread')
    return response, threads
def get_boards_from_site():
    # Scrape the board list from the first <select> on the site's front
    # page.  Returns a list of db.Board, or None when the page is
    # unreachable.
    r = try_get(config.base_url)
    if r == None:
        return
    board_elements = BeautifulSoup(
            r.text,
            'html.parser'
        ) \
        .find("select") \
        .find_all("option")
    # The first two <option>s are skipped -- presumably placeholder entries
    # in the board selector; TODO confirm against the site's markup.
    boards = [db.Board(i['value'], i.text) for i in board_elements[2:]]
    return boards
def archive_op(bs : BeautifulSoup, board : str):
    # Archive the OP (first post) of a thread page.  Returns its post
    # number as a string so callers can link replies to it; returns early
    # (number only) when the OP is already in the DB.
    op = bs.find(class_='op')
    # The second .post_no element is used -- presumably anchor vs. number
    # link in the vichan markup; confirm against a live page.
    no = op.find_all(class_='post_no')[1].text
    if db.is_post_archieved(board, int(no)):
        return no
    # Subject is optional on vichan; store '' when absent.
    subject = op.find(class_='subject')
    subject = subject.text if subject != None else ''
    t = db.Post(
            no = no,
            poster = op.find(class_='name').text,
            poster_id = op.find(class_='poster_id').text,
            date = op.find('time').text,
            subject = subject,
            text = op.find(class_='body').decode_contents(),
            board = board,
            num_files = len(op.find_all(class_='file'))
        )
    db.insert_post(t, board)
    return no
def archive_posts(op : str, bs : BeautifulSoup, board : str):
    # Archive the replies of a thread page, linking each to OP number `op`.
    posts = bs.find_all(class_='reply')
    # Walk newest-first so the loop can stop at the first reply that is
    # already archived -- everything older is assumed archived as well.
    posts.reverse()
    for p in posts:
        # Second .post_no element carries the number (same as archive_op;
        # presumably anchor vs. link -- confirm against the markup).
        no = p.find_all(class_='post_no')[1].text
        if db.is_post_archieved(board, int(no)):
            return
        post = db.Post(
                no = no,
                poster = p.find(class_='name').text,
                poster_id = p.find(class_='poster_id').text,
                date = p.find('time').text,
                text = p.find(class_='body').decode_contents(),
                thread = op,
                num_files = len(p.find_all(class_='file'))
            )
        db.insert_post(post, board)
def archive_file(board : str, post : str, fileinfo : BeautifulSoup, c : sqlite3.Connection, clutter = False):
    # Download one attachment and record it in the DB through connection `c`.
    # clutter=True forces a re-download even when the file already exists
    # on disk (used by the repair pass).
    name = fileinfo.find('span')\
        .find('span').text
    # Files are stored under a digest of the *displayed* file name.
    # NOTE(review): two different files sharing a display name collide on
    # this path -- confirm whether names are unique site-wide.
    path = 'files/' + hashlib.blake2s(name.encode()).hexdigest()
    if not clutter and os.path.isfile(path):
        print('\t\33[33mFile \033[34m\'', path, '\'\033[33m already exists.\033[0m', sep='')
        return
    r = try_get(config.base_url + fileinfo.find('a').attrs['href'])
    if r == None:
        return
    with open(path, 'wb') as f:
        f.write(r.content)
    # Record the mapping (name, post, board) -> on-disk path.
    f = db.File(
            name,
            post,
            board,
            path
        )
    db.insert_file(f, c)
def archive_files(bs : BeautifulSoup, board : str):
    # Download every attachment on a thread page: the OP's files first
    # (synchronously, on the pool's first connection), then each reply's
    # files in daemonized worker processes.
    # NOTE(review): this Event is created and immediately discarded --
    # looks like dead code; confirm and remove.
    multiprocessing.Event()
    files = bs.find(class_='files')
    for fileinfo in files.find_all(class_='fileinfo'):
        archive_file(board,
                bs.find(class_='thread').attrs['id'].split('_')[1],
                fileinfo,
                db.connection_pool[0]
            )
    thread_pool = []
    # Skip the first .post -- that is the OP, handled above.
    for p in bs.find_all(class_='post')[1:]:
        i = p.find_all(class_='fileinfo')
        for fileinfo in i:
            no = p.attrs['id'].split('_')[1]
            # Borrow a DB connection from the shared pool; block on
            # connection_produced while the pool is empty.
            con = None
            while 1:
                with db.connection_pool_lock:
                    if len(db.connection_pool) != 0:
                        con = db.connection_pool.pop(0)
                if con == None:
                    db.connection_produced.wait()
                else:
                    break
            thread = multiprocessing.Process(target=archive_file, args=[board, no, fileinfo, con])
            # NOTE(review): the connection is returned to the pool before the
            # worker runs -- safe only if the child gets its own copy via
            # fork; confirm the multiprocessing start method in use.
            with db.connection_pool_lock:
                db.connection_pool.append(con)
            thread.daemon = True
            thread_pool.append(thread)
            thread.start()
    for t in thread_pool:
        t.join()
def archive_thread(url : str, board : str):
    """Scrape one thread page and archive its OP, replies and files."""
    print(''.join(['\033[33mScrapping: ', url, '.\033[0m']))
    response = try_get(url)
    if response == None:
        return
    # A redirect to the 404 page means the thread vanished since listing.
    if response.url == config._404_url:
        print('\033[31mThread at ', url, ' 404d. It seems like it has been deleted in the meanwhile.\033[0m')
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    del response
    # Honour the configured thread filters unless -a was given.
    allowed = opts.archive_all or config.is_thread_allegeable(soup)
    if not allowed:
        return
    op_no = archive_op(soup, board)
    archive_posts(op_no, soup, board)
    archive_files(soup, board)
def archive_threads(board_name : str, threads : list):
    """Archive every thread element found on a board index page."""
    prefix_len = len('thread_')  # element ids look like 'thread_12345'
    for element in threads:
        thread_no = element.attrs['id'][prefix_len:]
        url = ''.join([config.base_url, '/', board_name, '/res/', thread_no, '.html'])
        archive_thread(url, board_name)
def archive_board(board_name : str):
    # Walk a board's index pages (min_page .. max_page-1, exclusive upper
    # bound) and archive every thread found on them.
    # NOTE(review): no '/' is inserted between base_url and board_name here,
    # unlike archive_threads -- presumably the board names carry their own
    # slashes or base_url ends with '/'; confirm config.base_url's form.
    board_url = config.base_url + board_name
    # NOTE(review): `status` is never used below.
    status = 0
    for i in range(config.min_page, config.max_page):
        # Page 1 is served as index.html; later pages as <n>.html.
        if i == 1:
            url = board_url + '/index.html'
        else:
            url = ''.join([board_url, '/', str(i), ".html"])
        try:
            response, threads = get_threads_from_page(url)
        except TypeError:
            # get_threads_from_page returned None (network error): skip page.
            continue
        print_status_got(i, response.status_code)
        if response.url == (config._404_url):
            # Redirected to the 404 page: past the last index page -- stop.
            return
        elif response.status_code != 200: # add better error handling
            #talom['board_url'] = ['board', 5]
            continue
        archive_threads(board_name, threads)
def repair_corrupted(board : str, op : str, no : str):
    """Re-download the attachments of post `no` in thread `op` on `board`.

    Fetches the live thread page, locates the post by its numeric id, and
    re-archives each of its files with clutter=True so existing (possibly
    corrupted) copies on disk are overwritten.  Returns silently when the
    page is unreachable; prints an error when the post is gone.
    """
    response = try_get(''.join([config.base_url, '/', board, '/res/', op, '.html']))
    if response == None:
        return
    thread = BeautifulSoup(
            response.text,
            'html.parser'
        )
    posts = thread.find_all(class_='post')
    # Find the post whose element id ('*_<no>') matches `no`.  The previous
    # binary search compared the numeric ids as *strings* ('100' < '99')
    # and could spin forever when the post was missing from the page; a
    # linear scan is correct and a thread page is small anyway.
    fileinfos = None
    for p in posts:
        if p.attrs['id'].split('_')[1] == no:
            fileinfos = p.find_all(class_='fileinfo')
            break
    if fileinfos == None:
        print('\033[31mCould not fetch fileinfos for \033[34m(', board, ', ', no, ')\033[31m.\033[0m', sep='' )
        return
    thread_pool = []
    for fi in fileinfos:
        # Borrow a DB connection from the shared pool (same protocol as
        # archive_files).  `con` must be reset to None each round, or the
        # first empty-pool check would raise NameError / reuse a stale value.
        con = None
        while 1:
            with db.connection_pool_lock:
                if len(db.connection_pool) != 0:
                    con = db.connection_pool.pop(0)
            if con == None:
                db.connection_produced.wait()
            else:
                break
        # Named `proc` to avoid shadowing the BeautifulSoup `thread` above.
        proc = multiprocessing.Process(target=archive_file, args=[board, no, fi, con, True])
        with db.connection_pool_lock:
            db.connection_pool.append(con)
        proc.daemon = True
        thread_pool.append(proc)
        proc.start()
    for t in thread_pool:
        t.join()
    print('\033[32mRepaired: \033[34m', board, '/', no, '\033[32m.\033[0m', sep='')

7
scrapper/service/cron.m4 Normal file

@ -0,0 +1,7 @@
dnl Generates the system crontab for the scrapper service
dnl (installed as /etc/cron.d/fc_scrapper by `make service').
dnl NL := a literal newline character, used by translit below to strip
dnl trailing newlines from shell command output.
define(NL, `
')dnl
dnl PWD := `pwd` output with its newline removed.
define(`PWD', translit(esyscmd(`pwd'), NL))dnl
dnl realpath(p) := canonical absolute path of p, newline removed.
define(realpath, `translit(esyscmd(readlink -f $1), NL)')dnl
dnl ROOT := repository root (two levels up from scrapper/service/).
define(`ROOT', realpath(PWD`/../../'))dnl
dnl Hourly scrap pass; repair pass every 3 hours at minute 30.
0 * * * * root make -C "ROOT" scrap
30 */3 * * * root make -C "ROOT" repair

5
scrapper/threadpool.py Normal file

@ -0,0 +1,5 @@
threadpool = []
def init_threads():
for i in range(max_threads):

10
scrapper/usage.py Normal file

@ -0,0 +1,10 @@
# ANSI-formatted help text; {0} is substituted with the program name.
# Now documents every option getopt accepts ('ab:ish' in opts.py) --
# -s and -h were previously missing.
usage_msg = '''\033[1m{0} [options]\033[0m
    -a : scrap all; ignore all filters
    -b <list> : provide a list of boards to archive
                the default is all that can be found
                <list> must be a valid python list of strings
    -i : perform integrity check; specify twice to not carry on with regular scrapping
    -s : run as a service instance (lock file protected); specify twice to
         kill and replace an already running instance
    -h : print this help message and exit
'''
def print_usage(program_name = 'scrapper'):
    """Print the formatted usage message for `program_name` to stdout."""
    print(usage_msg.format(program_name))

@ -0,0 +1,11 @@
dnl Apache vhost template for the archive front end (rendered by
dnl `make server' into $(VHOSTS_D)/archive.conf).
dnl PWD := `pwd` output...
define(`PWD', esyscmd(`pwd'))
dnl ...with the trailing newline chopped off (esyscmd keeps it).
define(`PWD', substr(PWD, 0, eval(len(PWD) - 1)))
dnl DOMAIN and PORT are defined in the shared config:
include(PWD`/srv/config.m4')
Listen PORT
<VirtualHost *:PORT>
	ServerName DOMAIN
	DocumentRoot "PWD`/front_end/'"
</VirtualHost>

2
srv/config.m4 Normal file

@ -0,0 +1,2 @@
dnl Site-wide settings consumed by the Apache vhost template
dnl (srv/archive.apache2.vhost.conf.m4).
define(`DOMAIN', `my_archive.org')
define(`PORT', `45872')