1
0
mirror of https://github.com/tiyn/stud.ip-crawler.git synced 2025-10-24 16:51:18 +02:00

database: switching from mysql to sqlite

Currently we only need the db to keep track of the files that were
already downloaded, so we only use one database with a single table.
A full SQL database server such as MySQL is too heavyweight for this purpose.
By using SQLite we can reduce the footprint and embed the db in the application.
This commit is contained in:
TiynGER
2020-11-17 11:04:03 +01:00
parent 415a21da3b
commit 4a8cf45ad3
7 changed files with 29 additions and 71 deletions

View File

@@ -56,21 +56,20 @@ Run `python3 run.py -h` for a help menu and see which ones are important for you
#### Environment-variables
Set the following variables with the `-e` flag.
| Name | Usage | Default |
| ---- |------ | ------- |
| ---------- | ----------------------------- | ------- |
| `USER` | username on the studip server | `admin` |
| `PSWD` | password on the studip server | `admin` |
| `URL` | url of the studip server | `admin` |
| `HOST` | ip of the mysql instance to connect | `mysql` |
| `DB_USER` | username of the mysql instance to connect | `root` |
| `DB_PSWD` | password of the mysql instance to connect | `root` |
| `INTERVAL` | update interval in seconds | `86400` |
#### Volumes
Set the following volumes with the `-v` flag.
| Volume-Name | Container mount | Description |
| ----------- | --------------- | ----------------------------------- |
| ------------- | ------------------ | ----------------------------------------- |
| `studip_data` | `/studip/src/data` | directory for studip files to be saved to |
#### docker-compose.yml

View File

@@ -8,24 +8,9 @@ services:
PSWD: 'pswd'
URL: 'https://url.tld'
INTERVAL: 86400
HOST: 'mysql'
DB_USER: root
DB_PSWD: 'pswddb'
volumes:
- studip:/studip/data
depends_on:
- mysql
mysql:
image: mysql:5.6
restart: unless-stopped
environment:
MYSQL_ROOT_PASSWORD: 'pswddb'
volumes:
- mysql:/var/lib/mysql
volumes:
studip:
driver: local
mysql:
driver: local

View File

@@ -1,3 +1,3 @@
#!/bin/bash
#!/bin/sh
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL --db_user $DB_USER --db_passwd $DB_PSWD --host $HOST && sleep $INTERVAL; done
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL && sleep $INTERVAL; done

View File

@@ -1,41 +1,30 @@
import time
import logging as log
import os
import pymysql
import pysqlite3
class Database:
def __init__(self, host, port, name, user, passwd, reset_dl):
self.HOST = host
self.PORT = port
self.NAME = name
self.USER = user
self.PASSWD = passwd
def __init__(self, reset_dl):
self.RESET_DL = reset_dl
self.TABLE_FILE = 'files'
self.DB_DIR = os.path.dirname(os.path.realpath(__file__))
self.setup_db()
def connect(self):
"""Connect to an existing database instance based on the object attributes.
"""Connect to an existing database instance based on the object
attributes.
"""
return pymysql.connect(
host=self.HOST,
port=self.PORT,
user=self.USER,
password=self.PASSWD,
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor
)
path = os.path.join(self.DB_DIR, "data.db")
return pysqlite3.connect(path)
def setup_db(self):
"""Creates a database with tables.
"""
"""Creates a database with tables."""
log.info("check database")
db = self.connect()
crs = db.cursor()
sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
crs.execute(sql_query)
db.select_db(self.NAME)
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
"(id CHAR(32) NOT NULL," + \
"ch_date INT(11) NOT NULL," + \
@@ -51,12 +40,11 @@ class Database:
time(int): time the file was downloaded
"""
db = self.connect()
db.select_db(self.NAME)
crs = db.cursor()
log.debug('file: ' + file_id + ' time: ' + time)
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
"VALUES ('" + file_id + "','" + time + "')" + \
"ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
"ON CONFLICT(`id`) DO UPDATE SET `ch_date` = '" + time + "'"
crs.execute(query)
db.commit()
@@ -72,11 +60,10 @@ class Database:
if self.RESET_DL:
return None
db = self.connect()
db.select_db(self.NAME)
crs = db.cursor()
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
crs.execute(query)
res = crs.fetchone()
if res != None:
return res['ch_date']
return res[0]
return None

View File

@@ -1,3 +1,2 @@
tqdm==4.46.1
requests==2.23.0
PyMySQL==0.9.3
pysqlite3==0.4.3
requests==2.24.0

View File

@@ -6,7 +6,7 @@ import logging as log
from studip import Studip
from crawler import Crawler
from mysql import Database
from database import Database
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
@@ -21,14 +21,6 @@ parser.add_argument('--chunk', type=int, default=1024 *
1024, help='chunksize for downloading data')
parser.add_argument('-r', '--reset_dl_date', action='store_true',
help='downloads everything and ignores last download date')
parser.add_argument('--host', type=str, default='localhost', help='mysql host')
parser.add_argument('--port', type=int, default=3306, help='mysql port')
parser.add_argument('--db_name', type=str, default='studip',
help='mysql database name')
parser.add_argument('--db_user', type=str, default='root',
help='mysql database user')
parser.add_argument('--db_passwd', type=str,
default='secret-pw', help='mysql database password')
parser.add_argument('-d', '--debug_output', action='store_true',
help='display debug information about the process')
parser.add_argument('-q', '--quiet', action='store_true',
@@ -53,8 +45,7 @@ BASE_DIR = os.path.abspath(args.output)
USERNAME = args.user
PASSWORD = args.passwd
db = Database(args.host, args.port, args.db_name,
args.db_user, args.db_passwd, args.reset_dl_date)
db = Database(args.reset_dl_date)
studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)

View File

@@ -146,13 +146,10 @@ class Studip:
last_dl = self.db.get_last_file_dl(doc)
if last_dl == None or last_dl < doc_chdate:
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
#total_size = int(rsp2.headers.get('content-length', 0))
log.info('downloading ' + doc_name)
#progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
try:
with open(doc_name, 'wb') as doc_file:
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
#progbar.update(len(chunk))
doc_file.write(chunk)
self.db.set_last_file_dl(str(doc), str(int(time.time())))
except OSError:
@@ -172,7 +169,7 @@ class Studip:
try:
subdirs = rsp.json()['subfolders']
except ValueError:
return res_docs
return res_subdirs
for subdir in subdirs:
try:
sub_id = subdir['id']