log: adding basic log with levels and file

The log has several levels that can be set from the
command line. The log file is hardcoded as log.txt
and can be toggled from the command line.
branch: master
author: TiynGER, 5 years ago
parent fa36e0f29e
commit 7ef3f063d2
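The logging setup this commit introduces follows the standard-library `logging` pattern: pick a level, then route records to the console or to a file via `basicConfig`. A minimal standalone sketch of the same idea (values are illustrative):

    import logging as log

    # Same mapping the new main script uses:
    # --quiet -> WARNING, --debug_output -> DEBUG, otherwise INFO.
    log_level = log.INFO

    # With filename set, records go to log.txt instead of stderr.
    log.basicConfig(level=log_level, filename='log.txt')
    log.info('crawler started')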

@@ -19,9 +19,10 @@ If you run the program again it only downloads files that have changed since the
 - [x] Possible reset of download date
 - [x] Incremental file download
 - [x] Store id and chdate of downloaded files
-- [ ] Logging
+- [x] Logging
 - [x] Console log
-- [ ] Log file
+- [x] Log file
+- [x] Specify log level
 
 ## Installation

src/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
 data
+log.txt

src/crawler.py
@@ -0,0 +1,54 @@
+import os
+import logging as log
+
+from studip import Studip
+
+
+class Crawler:
+    def __init__(self, studip):
+        self.studip = studip
+
+    def download_folder(self, folder):
+        docs = self.studip.get_docs(folder)
+        for doc in docs:
+            log.info('found doc ' + doc)
+            self.studip.download(doc)
+
+    def download_folder_rec(self, folder, base_dir):
+        log.info('crawling folder ' + folder)
+        self.create_dir(base_dir)
+        self.download_folder(folder)
+        subdirs = self.studip.get_subdirs(folder)
+        os.chdir(base_dir)
+        for subdir in subdirs:
+            subdir_name = subdirs[subdir].replace('/', '-')
+            subdir_path = os.path.join(base_dir, subdir_name)
+            log.debug(subdir_path)
+            self.create_dir(subdir_path)
+            os.chdir(subdir_path)
+            self.download_folder_rec(subdir, subdir_path)
+
+    def download_course(self, course, base_dir):
+        log.info('crawling course ' + course)
+        self.create_dir(base_dir)
+        os.chdir(base_dir)
+        root = self.studip.get_top_folder(course)
+        self.download_folder_rec(root, base_dir)
+
+    def download_curr_courses(self, base_dir):
+        log.info('Start crawling all current courses')
+        self.create_dir(base_dir)
+        curr_courses = self.studip.get_curr_courses(
+            self.studip.get_uid(), self.studip.get_curr_semester())
+        os.chdir(base_dir)
+        for course in curr_courses:
+            log.debug('course is ' + curr_courses[course])
+            course_name = curr_courses[course].replace('/', '-')
+            path = os.path.join(base_dir, course_name)
+            self.download_course(course, path)
+
+    def create_dir(self, dir):
+        if not os.path.exists(dir):
+            log.info('creating folder ' + dir)
+            os.mkdir(dir)
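One nit on the new log calls: messages are built with `+`, which concatenates even when the level is filtered out and fails if the argument is not a string. The stdlib logger also accepts deferred `%`-style arguments; a small sketch of that alternative (not what the commit does):

    import logging as log

    doc = 'abc123'
    # Formatted only if INFO is actually enabled; str() is applied implicitly.
    log.info('found doc %s', doc)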

src/mysql.py
@@ -1,27 +1,27 @@
-#!/bin/env python3
 import time
+import logging as log
 import pymysql
 
 
 class Database:
-    def __init__(self):
-        self.HOST = None
-        self.PORT = None
-        self.DB_NAME = None
-        self.USER = None
-        self.PASSW = None
-        self.TABLE_FILE = None
+    def __init__(self, host, port, name, user, passwd, reset_dl):
+        self.HOST = host
+        self.PORT = port
+        self.NAME = name
+        self.USER = user
+        self.PASSWD = passwd
+        self.RESET_DL = reset_dl
         self.TABLE_FILE = 'files'
-        self.RESET_DL_DATE = False
+        self.setup_db()
 
     def connect(self):
         return pymysql.connect(
             host=self.HOST,
             port=self.PORT,
             user=self.USER,
-            password=self.PASSW,
+            password=self.PASSWD,
             charset='utf8mb4',
             cursorclass=pymysql.cursors.DictCursor
         )
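The Database class now receives its connection settings as constructor arguments and calls setup_db() itself, instead of being mutated field by field after construction. Construction collapses to one call; the values below are the argparse defaults from the main script:

    from mysql import Database

    # host, port, name, user, passwd, reset_dl
    db = Database('localhost', 3306, 'studip', 'root', 'secret-pw', False)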
@@ -29,21 +29,21 @@ class Database:
     def setup_db(self):
         db = self.connect()
         crs = db.cursor()
-        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME
+        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
         crs.execute(sql_query)
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
             "(id CHAR(32) NOT NULL," + \
             "ch_date INT(11) NOT NULL," + \
             "PRIMARY KEY(id))"
         crs.execute(query)
-        print(db)
+        log.debug(db)
 
     def set_last_file_dl(self, file_id, time):
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
-        print('file: ', file_id, ' time: ', time)
+        log.debug('file: ' + file_id + ' time: ' + time)
         query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
             "VALUES ('" + file_id + "','" + time + "')" + \
             "ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
@@ -51,10 +51,10 @@ class Database:
         db.commit()
 
     def get_last_file_dl(self, file_id):
-        if self.RESET_DL_DATE:
+        if self.RESET_DL:
             return None
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
         query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
         crs.execute(query)
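The SQL is still assembled by string concatenation, which breaks on quotes in the values and is injection-prone. pymysql's cursor.execute accepts %s placeholders with a parameter tuple; a hedged sketch of how set_last_file_dl could bind its values instead:

    query = ("INSERT INTO files (`id`, `ch_date`) VALUES (%s, %s) "
             "ON DUPLICATE KEY UPDATE `ch_date` = %s")
    crs.execute(query, (file_id, time, time))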

@@ -1,9 +1,12 @@
 #!/bin/env python3
 import os
+import sys
 import argparse
+import logging as log
 
-import studip
-import mysql
+from studip import Studip
+from crawler import Crawler
+from mysql import Database
 
 parser = argparse.ArgumentParser(description='Download Files from StudIP.')
@@ -16,32 +19,49 @@ parser.add_argument('-p', '--passwd', type=str,
 parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
 parser.add_argument('--chunk', type=int, default=1024 *
                     1024, help='chunksize for downloading data')
-parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date')
+parser.add_argument('-r', '--reset_dl_date', action='store_true',
+                    help='downloads everything and ignores last download date')
 parser.add_argument('--host', type=str, default='localhost', help='mysql host')
 parser.add_argument('--port', type=int, default=3306, help='mysql port')
-parser.add_argument('--db_name', type=str, default='studip', help='mysql database name')
-parser.add_argument('--db_user', type=str, default='root', help='mysql database user')
-parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password')
+parser.add_argument('--db_name', type=str, default='studip',
+                    help='mysql database name')
+parser.add_argument('--db_user', type=str, default='root',
+                    help='mysql database user')
+parser.add_argument('--db_passwd', type=str,
+                    default='secret-pw', help='mysql database password')
+parser.add_argument('-d', '--debug_output', action='store_true',
+                    help='display debug information about the process')
+parser.add_argument('-q', '--quiet', action='store_true',
+                    help='only display most important output')
+parser.add_argument('-l', '--log_file', action='store_true',
+                    help='saves log to a log file named "log.txt"')
 
 args = parser.parse_args()
 
+if args.quiet:
+    log_level = log.WARNING
+elif args.debug_output:
+    log_level = log.DEBUG
+else:
+    log_level = log.INFO
+
+if args.log_file:
+    log.basicConfig(level=log_level, filename='log.txt')
+else:
+    log.basicConfig(level=log_level)
+
 BASE_DIR = os.path.abspath(args.output)
 USERNAME = args.user
 PASSWORD = args.passwd
 
-db = mysql.Database()
-db.HOST = args.host
-db.PORT = args.port
-db.DB_NAME = args.db_name
-db.USER = args.db_user
-db.PASSW = args.db_passwd
-db.RESET_DL_DATE = args.reset_dl_date
-db.setup_db()
+db = Database(args.host, args.port, args.db_name,
+              args.db_user, args.db_passwd, args.reset_dl_date)
 
-crwlr = studip.Crawler(db)
-crwlr.CHUNK_SIZE = args.chunk
-crwlr.STUDIP_DOMAIN = args.url
-crwlr.USER = (USERNAME, PASSWORD)
+studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
+crawler = Crawler(studip)
 
-crwlr.download_curr_courses(BASE_DIR)
+# Start crawling
+try:
+    crawler.download_curr_courses(BASE_DIR)
+except KeyboardInterrupt:
+    sys.exit(0)
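Taken together, a typical run with the new switches could look like the line below. The entry script's filename is not shown in this diff, so main.py is a stand-in, and the --user/--output spellings are assumed from args.user/args.output. Note that -q takes precedence over -d, because quiet is checked first in the level selection.

    python3 main.py --user me --passwd secret -s https://studip.example.com \
        --output downloads -d -l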

src/studip.py
@@ -1,35 +1,30 @@
-#!/bin/env python3
 import time
-import os
-import argparse
+import logging as log
 
 from tqdm import tqdm
 import requests as req
 from requests.auth import HTTPBasicAuth
 
 
-class Crawler:
-    def __init__(self, db):
-        self.CHUNK_SIZE = None
-        self.STUDIP_DOMAIN = None
-        self.USER = None
+class Studip:
+    def __init__(self, chunk_size, domain, user, db):
+        self.CHUNK_SIZE = chunk_size
+        self.DOMAIN = domain
+        self.USER = user
         self.db = db
 
-    def create_dir(self, dir):
-        if not os.path.exists(dir):
-            print('creating folder', dir)
-            os.mkdir(dir)
+    def auth_req(self, url):
+        url = self.DOMAIN + url
+        return req.get(url, auth=self.USER)
 
     def get_uid(self):
-        url = self.STUDIP_DOMAIN + '/api.php/user/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/')
         user_id = rsp.json()['user_id']
         return user_id
 
     def get_curr_semester(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         curr_time = int(str(int(time.time())))
         semesters = rsp.json()['collection']
         for sem_uri in semesters:
@@ -41,8 +36,7 @@ class Crawler:
         return 0
 
     def get_ordered_semesters(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         semesters = rsp.json()['collection']
         order_sems = []
         for sem_uri in semesters:
@@ -50,8 +44,7 @@ class Crawler:
         return order_sems
 
     def get_curr_courses(self, user_id, semester):
-        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/' + user_id + '/courses')
         ord_sems = self.get_ordered_semesters()
         courses = rsp.json()['collection']
         i = 0
@@ -76,15 +69,13 @@ class Crawler:
         return course_list
 
     def get_top_folder(self, course):
-        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
         top_folder = rsp.json()
         tf_id = top_folder['id']
         return(tf_id)
 
     def get_docs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         docs = rsp.json()['file_refs']
         res_docs = []
         for doc in docs:
@@ -93,16 +84,14 @@ class Crawler:
         return(res_docs)
 
     def download(self, doc):
-        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
-        rsp1 = req.get(url1, auth=self.USER)
+        rsp1 = self.auth_req('/api.php/file/' + doc)
         doc_name = rsp1.json()['name']
         doc_chdate = rsp1.json()['chdate']
         last_dl = self.db.get_last_file_dl(doc)
         if last_dl == None or last_dl < doc_chdate:
-            print('downloading ', doc_name)
-            url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
-            rsp2 = req.get(url2, auth=self.USER, stream=True)
+            rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
             total_size = int(rsp2.headers.get('content-length', 0))
+            print('downloading ' + doc_name)
             progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
             with open(doc_name, 'wb') as doc_file:
                 for chunk in rsp2.iter_content(self.CHUNK_SIZE):
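The refactored download() now goes through auth_req, which drops the stream=True that the old url2 request passed. Without it, requests buffers the entire body before iter_content() yields anything, so the tqdm bar no longer tracks network progress. A hedged sketch of a streaming-aware helper (a parameter this commit does not add):

    def auth_req(self, url, stream=False):
        # stream=True defers the body download until iter_content()
        return req.get(self.DOMAIN + url, auth=self.USER, stream=stream)

    # in download():
    rsp2 = self.auth_req('/api.php/file/' + doc + '/download', stream=True)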
@@ -111,8 +100,7 @@ class Crawler:
         self.db.set_last_file_dl(str(doc), str(int(time.time())))
 
     def get_subdirs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         subdirs = rsp.json()['subfolders']
         docs = rsp.json()['file_refs']
         res_subdirs = {}
@@ -121,42 +109,3 @@ class Crawler:
             sub_name = subdir['name']
             res_subdirs[sub_id] = sub_name
         return res_subdirs
-
-    def download_folder(self, folder):
-        docs = self.get_docs(folder)
-        for doc in docs:
-            print('found doc ', doc)
-            self.download(doc)
-
-    def download_folder_rec(self, folder, base_dir):
-        print('folder ', folder)
-        self.create_dir(base_dir)
-        self.download_folder(folder)
-        subdirs = self.get_subdirs(folder)
-        os.chdir(base_dir)
-        for subdir in subdirs:
-            subdir_name = subdirs[subdir].replace('/', '-')
-            subdir_path = os.path.join(base_dir, subdir_name)
-            print(subdir_path)
-            self.create_dir(subdir_path)
-            os.chdir(subdir_path)
-            self.download_folder_rec(subdir, subdir_path)
-
-    def download_course(self, course, base_dir):
-        print('course ', course)
-        self.create_dir(base_dir)
-        os.chdir(base_dir)
-        root = self.get_top_folder(course)
-        self.download_folder_rec(root, base_dir)
-
-    def download_curr_courses(self, base_dir):
-        print('Start downloading all current courses')
-        self.create_dir(base_dir)
-        curr_courses = self.get_curr_courses(
-            self.get_uid(), self.get_curr_semester())
-        os.chdir(base_dir)
-        for course in curr_courses:
-            print('course is ', curr_courses[course])
-            course_name = curr_courses[course].replace('/', '-')
-            path = os.path.join(base_dir, course_name)
-            self.download_course(course, path)
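Since auth_req now funnels every API call to the same host with the same credentials, a requests.Session would add connection reuse and attach the auth once; a small sketch of that variant (not part of this commit):

    import requests as req

    class Studip:
        def __init__(self, chunk_size, domain, user, db):
            self.CHUNK_SIZE = chunk_size
            self.DOMAIN = domain
            self.db = db
            self.session = req.Session()  # keep-alive across API calls
            self.session.auth = user      # (username, password) tuple

        def auth_req(self, url):
            return self.session.get(self.DOMAIN + url)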
