mirror of
https://github.com/tiyn/stud.ip-crawler.git
synced 2025-04-01 15:37:47 +02:00
database: switching from mysql to sqlite
Currently we only need the db to keep track of the files that were already downloaded, so we use only one database with a single table. A full SQL database server is too heavyweight for this purpose. By using SQLite we can shrink the footprint and embed the db directly in the application.
This commit is contained in:
parent
415a21da3b
commit
4a8cf45ad3
21
README.md
21
README.md
@ -56,21 +56,20 @@ Run `python3 run.py -h` for a help menu and see which ones are important for you
|
||||
#### Environment-variables
|
||||
|
||||
Set the following variables with the `-e` flag.
|
||||
| Name | Usage | Default |
|
||||
| ---- |------ | ------- |
|
||||
| `USER` | username on the studip server | `admin` |
|
||||
| `PSWD` | password on the studip server | `admin` |
|
||||
| `URL` | url of the studip server | `admin` |
|
||||
| `HOST` | ip of the mysql instance to connect | `mysql` |
|
||||
| `DB_USER` | username of the mysql instance to connect | `root` |
|
||||
| `DB_PSWD` | password of the mysql instance to connect | `root` |
|
||||
| `INTERVAl` | update interval in seconds | `86400` |
|
||||
|
||||
| Name | Usage | Default |
|
||||
| ---------- | ----------------------------- | ------- |
|
||||
| `USER` | username on the studip server | `admin` |
|
||||
| `PSWD` | password on the studip server | `admin` |
|
||||
| `URL` | url of the studip server | `admin` |
|
||||
| `INTERVAL` | update interval in seconds | `86400` |
|
||||
|
||||
#### Volumes
|
||||
|
||||
Set the following volumes with the `-v` flag.
|
||||
| Volume-Name | Container mount | Description |
|
||||
| ----------- | --------------- | ----------------------------------- |
|
||||
|
||||
| Volume-Name | Container mount | Description |
|
||||
| ------------- | ------------------ | ----------------------------------------- |
|
||||
| `studip_data` | `/studip/src/data` | directory for studip files to be saved to |
|
||||
|
||||
#### docker-compose.yml
|
||||
|
@ -8,24 +8,9 @@ services:
|
||||
PSWD: 'pswd'
|
||||
URL: 'https://url.tld'
|
||||
INTERVAL: 86400
|
||||
HOST: 'mysql'
|
||||
DB_USER: root
|
||||
DB_PSWD: 'pswddb'
|
||||
volumes:
|
||||
- studip:/studip/data
|
||||
depends_on:
|
||||
- mysql
|
||||
|
||||
mysql:
|
||||
image: mysql:5.6
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
MYSQL_ROOT_PASSWORD: 'pswddb'
|
||||
volumes:
|
||||
- mysql:/var/lib/mysql
|
||||
|
||||
volumes:
|
||||
studip:
|
||||
driver: local
|
||||
mysql:
|
||||
driver: local
|
||||
|
@ -1,3 +1,3 @@
|
||||
#!/bin/bash
|
||||
#!/bin/sh
|
||||
|
||||
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL --db_user $DB_USER --db_passwd $DB_PSWD --host $HOST && sleep $INTERVAL; done
|
||||
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL && sleep $INTERVAL; done
|
||||
|
@ -1,41 +1,30 @@
|
||||
import time
|
||||
import logging as log
|
||||
import os
|
||||
|
||||
import pymysql
|
||||
import pysqlite3
|
||||
|
||||
|
||||
class Database:
|
||||
|
||||
def __init__(self, host, port, name, user, passwd, reset_dl):
|
||||
self.HOST = host
|
||||
self.PORT = port
|
||||
self.NAME = name
|
||||
self.USER = user
|
||||
self.PASSWD = passwd
|
||||
def __init__(self, reset_dl):
|
||||
self.RESET_DL = reset_dl
|
||||
self.TABLE_FILE = 'files'
|
||||
self.DB_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||
self.setup_db()
|
||||
|
||||
def connect(self):
|
||||
"""Connect to an existing database instance based on the object attributes.
|
||||
"""Connect to an existing database instance based on the object
|
||||
attributes.
|
||||
"""
|
||||
return pymysql.connect(
|
||||
host=self.HOST,
|
||||
port=self.PORT,
|
||||
user=self.USER,
|
||||
password=self.PASSWD,
|
||||
charset='utf8mb4',
|
||||
cursorclass=pymysql.cursors.DictCursor
|
||||
)
|
||||
path = os.path.join(self.DB_DIR, "data.db")
|
||||
return pysqlite3.connect(path)
|
||||
|
||||
def setup_db(self):
|
||||
"""Creates a database with tables.
|
||||
"""
|
||||
"""Creates a database with tables."""
|
||||
log.info("check database")
|
||||
db = self.connect()
|
||||
crs = db.cursor()
|
||||
sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
|
||||
crs.execute(sql_query)
|
||||
db.select_db(self.NAME)
|
||||
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
|
||||
"(id CHAR(32) NOT NULL," + \
|
||||
"ch_date INT(11) NOT NULL," + \
|
||||
@ -51,12 +40,11 @@ class Database:
|
||||
time(int): time the file was downloaded
|
||||
"""
|
||||
db = self.connect()
|
||||
db.select_db(self.NAME)
|
||||
crs = db.cursor()
|
||||
log.debug('file: ' + file_id + ' time: ' + time)
|
||||
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
|
||||
"VALUES ('" + file_id + "','" + time + "')" + \
|
||||
"ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
|
||||
"ON CONFLICT(`id`) DO UPDATE SET `ch_date` = '" + time + "'"
|
||||
crs.execute(query)
|
||||
db.commit()
|
||||
|
||||
@ -72,11 +60,10 @@ class Database:
|
||||
if self.RESET_DL:
|
||||
return None
|
||||
db = self.connect()
|
||||
db.select_db(self.NAME)
|
||||
crs = db.cursor()
|
||||
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
|
||||
crs.execute(query)
|
||||
res = crs.fetchone()
|
||||
if res != None:
|
||||
return res['ch_date']
|
||||
return res[0]
|
||||
return None
|
@ -1,3 +1,2 @@
|
||||
tqdm==4.46.1
|
||||
requests==2.23.0
|
||||
PyMySQL==0.9.3
|
||||
pysqlite3==0.4.3
|
||||
requests==2.24.0
|
||||
|
13
src/run.py
13
src/run.py
@ -6,7 +6,7 @@ import logging as log
|
||||
|
||||
from studip import Studip
|
||||
from crawler import Crawler
|
||||
from mysql import Database
|
||||
from database import Database
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
|
||||
@ -21,14 +21,6 @@ parser.add_argument('--chunk', type=int, default=1024 *
|
||||
1024, help='chunksize for downloading data')
|
||||
parser.add_argument('-r', '--reset_dl_date', action='store_true',
|
||||
help='downloads everything and ignores last download date')
|
||||
parser.add_argument('--host', type=str, default='localhost', help='mysql host')
|
||||
parser.add_argument('--port', type=int, default=3306, help='mysql port')
|
||||
parser.add_argument('--db_name', type=str, default='studip',
|
||||
help='mysql database name')
|
||||
parser.add_argument('--db_user', type=str, default='root',
|
||||
help='mysql database user')
|
||||
parser.add_argument('--db_passwd', type=str,
|
||||
default='secret-pw', help='mysql database password')
|
||||
parser.add_argument('-d', '--debug_output', action='store_true',
|
||||
help='display debug information about the process')
|
||||
parser.add_argument('-q', '--quiet', action='store_true',
|
||||
@ -53,8 +45,7 @@ BASE_DIR = os.path.abspath(args.output)
|
||||
USERNAME = args.user
|
||||
PASSWORD = args.passwd
|
||||
|
||||
db = Database(args.host, args.port, args.db_name,
|
||||
args.db_user, args.db_passwd, args.reset_dl_date)
|
||||
db = Database(args.reset_dl_date)
|
||||
|
||||
studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
|
||||
|
||||
|
@ -146,13 +146,10 @@ class Studip:
|
||||
last_dl = self.db.get_last_file_dl(doc)
|
||||
if last_dl == None or last_dl < doc_chdate:
|
||||
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
|
||||
#total_size = int(rsp2.headers.get('content-length', 0))
|
||||
log.info('downloading ' + doc_name)
|
||||
#progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
|
||||
try:
|
||||
with open(doc_name, 'wb') as doc_file:
|
||||
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
|
||||
#progbar.update(len(chunk))
|
||||
doc_file.write(chunk)
|
||||
self.db.set_last_file_dl(str(doc), str(int(time.time())))
|
||||
except OSError:
|
||||
@ -172,7 +169,7 @@ class Studip:
|
||||
try:
|
||||
subdirs = rsp.json()['subfolders']
|
||||
except ValueError:
|
||||
return res_docs
|
||||
return res_subdirs
|
||||
for subdir in subdirs:
|
||||
try:
|
||||
sub_id = subdir['id']
|
||||
|
Loading…
x
Reference in New Issue
Block a user