mirror of
https://github.com/tiyn/stud.ip-crawler.git
synced 2025-04-01 15:37:47 +02:00
database: switching from mysql to sqlite
Currently we only need the database to keep track of the files that have already been downloaded, so a single database with a single table is sufficient. A full SQL database server is too heavyweight for this purpose. By switching to SQLite we can reduce the footprint and embed the database directly in the application.
This commit is contained in:
parent
415a21da3b
commit
4a8cf45ad3
21
README.md
21
README.md
@ -56,21 +56,20 @@ Run `python3 run.py -h` for a help menu and see which ones are important for you
|
|||||||
#### Environment-variables
|
#### Environment-variables
|
||||||
|
|
||||||
Set the following variables with the -e tag.
|
Set the following variables with the -e tag.
|
||||||
| Name | Usage | Default |
|
|
||||||
| ---- |------ | ------- |
|
| Name | Usage | Default |
|
||||||
| `USER` | username on the studip server | `admin` |
|
| ---------- | ----------------------------- | ------- |
|
||||||
| `PSWD` | password on the studip server | `admin` |
|
| `USER` | username on the studip server | `admin` |
|
||||||
| `URL` | url of the studip server | `admin` |
|
| `PSWD` | password on the studip server | `admin` |
|
||||||
| `HOST` | ip of the mysql instance to connect | `mysql` |
|
| `URL` | url of the studip server | `admin` |
|
||||||
| `DB_USER` | username of the mysql instance to connect | `root` |
|
| `INTERVAL` | update interval in seconds | `86400` |
|
||||||
| `DB_PSWD` | password of the mysql instance to connect | `root` |
|
|
||||||
| `INTERVAL` | update interval in seconds | `86400` |
|
|
||||||
|
|
||||||
#### Volumes
|
#### Volumes
|
||||||
|
|
||||||
Set the following volumes with the -v tag.
|
Set the following volumes with the -v tag.
|
||||||
| Volume-Name | Container mount | Description |
|
|
||||||
| ----------- | --------------- | ----------------------------------- |
|
| Volume-Name | Container mount | Description |
|
||||||
|
| ------------- | ------------------ | ----------------------------------------- |
|
||||||
| `studip_data` | `/studip/src/data` | directory for studip files to be saved to |
|
| `studip_data` | `/studip/src/data` | directory for studip files to be saved to |
|
||||||
|
|
||||||
#### docker-compose.yml
|
#### docker-compose.yml
|
||||||
|
@ -8,24 +8,9 @@ services:
|
|||||||
PSWD: 'pswd'
|
PSWD: 'pswd'
|
||||||
URL: 'https://url.tld'
|
URL: 'https://url.tld'
|
||||||
INTERVAL: 86400
|
INTERVAL: 86400
|
||||||
HOST: 'mysql'
|
|
||||||
DB_USER: root
|
|
||||||
DB_PSWD: 'pswddb'
|
|
||||||
volumes:
|
volumes:
|
||||||
- studip:/studip/data
|
- studip:/studip/data
|
||||||
depends_on:
|
|
||||||
- mysql
|
|
||||||
|
|
||||||
mysql:
|
|
||||||
image: mysql:5.6
|
|
||||||
restart: unless-stopped
|
|
||||||
environment:
|
|
||||||
MYSQL_ROOT_PASSWORD: 'pswddb'
|
|
||||||
volumes:
|
|
||||||
- mysql:/var/lib/mysql
|
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
studip:
|
studip:
|
||||||
driver: local
|
driver: local
|
||||||
mysql:
|
|
||||||
driver: local
|
|
||||||
|
@ -1,3 +1,3 @@
|
|||||||
#!/bin/bash
|
#!/bin/sh
|
||||||
|
|
||||||
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL --db_user $DB_USER --db_passwd $DB_PSWD --host $HOST && sleep $INTERVAL; done
|
while true; do python /studip/run.py -o /studip/data -u $USER -p $PSWD -s $URL && sleep $INTERVAL; done
|
||||||
|
@ -1,41 +1,30 @@
|
|||||||
import time
|
import time
|
||||||
import logging as log
|
import logging as log
|
||||||
|
import os
|
||||||
|
|
||||||
import pymysql
|
import pysqlite3
|
||||||
|
|
||||||
|
|
||||||
class Database:
|
class Database:
|
||||||
|
|
||||||
def __init__(self, host, port, name, user, passwd, reset_dl):
|
def __init__(self, reset_dl):
|
||||||
self.HOST = host
|
|
||||||
self.PORT = port
|
|
||||||
self.NAME = name
|
|
||||||
self.USER = user
|
|
||||||
self.PASSWD = passwd
|
|
||||||
self.RESET_DL = reset_dl
|
self.RESET_DL = reset_dl
|
||||||
self.TABLE_FILE = 'files'
|
self.TABLE_FILE = 'files'
|
||||||
|
self.DB_DIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
self.setup_db()
|
self.setup_db()
|
||||||
|
|
||||||
def connect(self):
|
def connect(self):
|
||||||
"""Connect to an existing database instance based on the object attributes.
|
"""Connect to an existing database instance based on the object
|
||||||
|
attributes.
|
||||||
"""
|
"""
|
||||||
return pymysql.connect(
|
path = os.path.join(self.DB_DIR, "data.db")
|
||||||
host=self.HOST,
|
return pysqlite3.connect(path)
|
||||||
port=self.PORT,
|
|
||||||
user=self.USER,
|
|
||||||
password=self.PASSWD,
|
|
||||||
charset='utf8mb4',
|
|
||||||
cursorclass=pymysql.cursors.DictCursor
|
|
||||||
)
|
|
||||||
|
|
||||||
def setup_db(self):
|
def setup_db(self):
|
||||||
"""Creates a database with tables.
|
"""Creates a database with tables."""
|
||||||
"""
|
log.info("check database")
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
|
|
||||||
crs.execute(sql_query)
|
|
||||||
db.select_db(self.NAME)
|
|
||||||
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
|
query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
|
||||||
"(id CHAR(32) NOT NULL," + \
|
"(id CHAR(32) NOT NULL," + \
|
||||||
"ch_date INT(11) NOT NULL," + \
|
"ch_date INT(11) NOT NULL," + \
|
||||||
@ -51,12 +40,11 @@ class Database:
|
|||||||
time(int): time the file was downloaded
|
time(int): time the file was downloaded
|
||||||
"""
|
"""
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
db.select_db(self.NAME)
|
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
log.debug('file: ' + file_id + ' time: ' + time)
|
log.debug('file: ' + file_id + ' time: ' + time)
|
||||||
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
|
query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
|
||||||
"VALUES ('" + file_id + "','" + time + "')" + \
|
"VALUES ('" + file_id + "','" + time + "')" + \
|
||||||
"ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
|
"ON CONFLICT(`id`) DO UPDATE SET `ch_date` = '" + time + "'"
|
||||||
crs.execute(query)
|
crs.execute(query)
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
@ -72,11 +60,10 @@ class Database:
|
|||||||
if self.RESET_DL:
|
if self.RESET_DL:
|
||||||
return None
|
return None
|
||||||
db = self.connect()
|
db = self.connect()
|
||||||
db.select_db(self.NAME)
|
|
||||||
crs = db.cursor()
|
crs = db.cursor()
|
||||||
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
|
query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
|
||||||
crs.execute(query)
|
crs.execute(query)
|
||||||
res = crs.fetchone()
|
res = crs.fetchone()
|
||||||
if res != None:
|
if res != None:
|
||||||
return res['ch_date']
|
return res[0]
|
||||||
return None
|
return None
|
@ -1,3 +1,2 @@
|
|||||||
tqdm==4.46.1
|
pysqlite3==0.4.3
|
||||||
requests==2.23.0
|
requests==2.24.0
|
||||||
PyMySQL==0.9.3
|
|
||||||
|
13
src/run.py
13
src/run.py
@ -6,7 +6,7 @@ import logging as log
|
|||||||
|
|
||||||
from studip import Studip
|
from studip import Studip
|
||||||
from crawler import Crawler
|
from crawler import Crawler
|
||||||
from mysql import Database
|
from database import Database
|
||||||
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
|
parser = argparse.ArgumentParser(description='Download Files from StudIP.')
|
||||||
@ -21,14 +21,6 @@ parser.add_argument('--chunk', type=int, default=1024 *
|
|||||||
1024, help='chunksize for downloading data')
|
1024, help='chunksize for downloading data')
|
||||||
parser.add_argument('-r', '--reset_dl_date', action='store_true',
|
parser.add_argument('-r', '--reset_dl_date', action='store_true',
|
||||||
help='downloads everything and ignores last download date')
|
help='downloads everything and ignores last download date')
|
||||||
parser.add_argument('--host', type=str, default='localhost', help='mysql host')
|
|
||||||
parser.add_argument('--port', type=int, default=3306, help='mysql port')
|
|
||||||
parser.add_argument('--db_name', type=str, default='studip',
|
|
||||||
help='mysql database name')
|
|
||||||
parser.add_argument('--db_user', type=str, default='root',
|
|
||||||
help='mysql database user')
|
|
||||||
parser.add_argument('--db_passwd', type=str,
|
|
||||||
default='secret-pw', help='mysql database password')
|
|
||||||
parser.add_argument('-d', '--debug_output', action='store_true',
|
parser.add_argument('-d', '--debug_output', action='store_true',
|
||||||
help='display debug information about the process')
|
help='display debug information about the process')
|
||||||
parser.add_argument('-q', '--quiet', action='store_true',
|
parser.add_argument('-q', '--quiet', action='store_true',
|
||||||
@ -53,8 +45,7 @@ BASE_DIR = os.path.abspath(args.output)
|
|||||||
USERNAME = args.user
|
USERNAME = args.user
|
||||||
PASSWORD = args.passwd
|
PASSWORD = args.passwd
|
||||||
|
|
||||||
db = Database(args.host, args.port, args.db_name,
|
db = Database(args.reset_dl_date)
|
||||||
args.db_user, args.db_passwd, args.reset_dl_date)
|
|
||||||
|
|
||||||
studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
|
studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
|
||||||
|
|
||||||
|
@ -146,13 +146,10 @@ class Studip:
|
|||||||
last_dl = self.db.get_last_file_dl(doc)
|
last_dl = self.db.get_last_file_dl(doc)
|
||||||
if last_dl == None or last_dl < doc_chdate:
|
if last_dl == None or last_dl < doc_chdate:
|
||||||
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
|
rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
|
||||||
#total_size = int(rsp2.headers.get('content-length', 0))
|
|
||||||
log.info('downloading ' + doc_name)
|
log.info('downloading ' + doc_name)
|
||||||
#progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
|
|
||||||
try:
|
try:
|
||||||
with open(doc_name, 'wb') as doc_file:
|
with open(doc_name, 'wb') as doc_file:
|
||||||
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
|
for chunk in rsp2.iter_content(self.CHUNK_SIZE):
|
||||||
#progbar.update(len(chunk))
|
|
||||||
doc_file.write(chunk)
|
doc_file.write(chunk)
|
||||||
self.db.set_last_file_dl(str(doc), str(int(time.time())))
|
self.db.set_last_file_dl(str(doc), str(int(time.time())))
|
||||||
except OSError:
|
except OSError:
|
||||||
@ -172,7 +169,7 @@ class Studip:
|
|||||||
try:
|
try:
|
||||||
subdirs = rsp.json()['subfolders']
|
subdirs = rsp.json()['subfolders']
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return res_docs
|
return res_subdirs
|
||||||
for subdir in subdirs:
|
for subdir in subdirs:
|
||||||
try:
|
try:
|
||||||
sub_id = subdir['id']
|
sub_id = subdir['id']
|
||||||
|
Loading…
x
Reference in New Issue
Block a user