log: adding basic log with levels and file

The log supports several levels that can be selected
from the command line.
The log file name is hardcoded as log.txt, and writing
to it can be toggled with a command-line flag.
branch master · TiynGER, 4 years ago
parent fa36e0f29e
commit 7ef3f063d2
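Condensed, the behavior the message describes looks roughly like this (flag names are taken from the diff below; the rest is a standalone sketch, not the committed file):

    import argparse
    import logging as log

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--debug_output', action='store_true')
    parser.add_argument('-q', '--quiet', action='store_true')
    parser.add_argument('-l', '--log_file', action='store_true')
    args = parser.parse_args()

    # -q wins over -d; default verbosity is INFO
    if args.quiet:
        log_level = log.WARNING
    elif args.debug_output:
        log_level = log.DEBUG
    else:
        log_level = log.INFO

    # the file name is fixed; -l only toggles whether it is used
    log.basicConfig(level=log_level,
                    filename='log.txt' if args.log_file else None)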

@@ -19,9 +19,10 @@ If you run the program again it only downloads files that have changed since the
 - [x] Possible reset of download date
 - [x] Incremental file download
 - [x] Store id and chdate of downloaded files
-- [ ] Logging
+- [x] Logging
 - [x] Console log
-- [ ] Log file
+- [x] Log file
+- [x] Specify log level

 ## Installation

src/.gitignore vendored
@@ -1,2 +1,3 @@
 __pycache__
 data
+log.txt

src/crawler.py
@@ -0,0 +1,54 @@
+import os
+import logging as log
+
+from studip import Studip
+
+
+class Crawler:
+    def __init__(self, studip):
+        self.studip = studip
+
+    def download_folder(self, folder):
+        docs = self.studip.get_docs(folder)
+        for doc in docs:
+            log.info('found doc ' + doc)
+            self.studip.download(doc)
+
+    def download_folder_rec(self, folder, base_dir):
+        log.info('crawling folder ' + folder)
+        self.create_dir(base_dir)
+        self.download_folder(folder)
+        subdirs = self.studip.get_subdirs(folder)
+        os.chdir(base_dir)
+        for subdir in subdirs:
+            subdir_name = subdirs[subdir].replace('/', '-')
+            subdir_path = os.path.join(base_dir, subdir_name)
+            log.debug(subdir_path)
+            self.create_dir(subdir_path)
+            os.chdir(subdir_path)
+            self.download_folder_rec(subdir, subdir_path)
+
+    def download_course(self, course, base_dir):
+        log.info('crawling course ' + course)
+        self.create_dir(base_dir)
+        os.chdir(base_dir)
+        root = self.studip.get_top_folder(course)
+        self.download_folder_rec(root, base_dir)
+
+    def download_curr_courses(self, base_dir):
+        log.info('Start crawling all current courses')
+        self.create_dir(base_dir)
+        curr_courses = self.studip.get_curr_courses(
+            self.studip.get_uid(), self.studip.get_curr_semester())
+        os.chdir(base_dir)
+        for course in curr_courses:
+            log.debug('course is ' + curr_courses[course])
+            course_name = curr_courses[course].replace('/', '-')
+            path = os.path.join(base_dir, course_name)
+            self.download_course(course, path)
+
+    def create_dir(self, dir):
+        if not os.path.exists(dir):
+            log.info('creating folder' + dir)
+            os.mkdir(dir)
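An aside on the log calls above: concatenation builds the message string even when the level is filtered out, and raises if the argument is not a str. The logging module's own %-style arguments defer formatting; a tiny sketch of the alternative (not what this commit does):

    import logging as log

    log.basicConfig(level=log.INFO)
    doc = 4242  # non-str values are fine; %s converts lazily
    log.info('found doc %s', doc)  # message is only built if INFO is enabled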

src/mysql.py
@@ -1,27 +1,27 @@
 #!/bin/env python3
 import time
+import logging as log
 import pymysql


 class Database:
-    def __init__(self):
-        self.HOST = None
-        self.PORT = None
-        self.DB_NAME = None
-        self.USER = None
-        self.PASSW = None
-        self.TABLE_FILE = None
+    def __init__(self, host, port, name, user, passwd, reset_dl):
+        self.HOST = host
+        self.PORT = port
+        self.NAME = name
+        self.USER = user
+        self.PASSWD = passwd
+        self.RESET_DL = reset_dl
+        self.TABLE_FILE = 'files'
-        self.RESET_DL_DATE = False
+        self.setup_db()

     def connect(self):
         return pymysql.connect(
             host=self.HOST,
             port=self.PORT,
             user=self.USER,
-            password=self.PASSW,
+            password=self.PASSWD,
             charset='utf8mb4',
             cursorclass=pymysql.cursors.DictCursor
         )
@@ -29,21 +29,21 @@ class Database:
     def setup_db(self):
         db = self.connect()
         crs = db.cursor()
-        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.DB_NAME
+        sql_query = "CREATE DATABASE IF NOT EXISTS " + self.NAME
         crs.execute(sql_query)
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         query = "CREATE TABLE IF NOT EXISTS " + self.TABLE_FILE + \
             "(id CHAR(32) NOT NULL," + \
             "ch_date INT(11) NOT NULL," + \
             "PRIMARY KEY(id))"
         crs.execute(query)
-        print(db)
+        log.debug(db)

     def set_last_file_dl(self, file_id, time):
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
-        print('file: ', file_id, ' time: ', time)
+        log.debug('file: ' + file_id + ' time: ' + time)
         query = "INSERT INTO " + self.TABLE_FILE + "(`id`,`ch_date`)" + \
             "VALUES ('" + file_id + "','" + time + "')" + \
             "ON DUPLICATE KEY UPDATE `ch_date` = '" + time + "'"
@@ -51,10 +51,10 @@ class Database:
         db.commit()

     def get_last_file_dl(self, file_id):
-        if self.RESET_DL_DATE:
+        if self.RESET_DL:
             return None
         db = self.connect()
-        db.select_db(self.DB_NAME)
+        db.select_db(self.NAME)
         crs = db.cursor()
         query = "SELECT ch_date FROM files WHERE id ='" + file_id + "'"
         crs.execute(query)
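The queries above splice file_id and time straight into the SQL text. pymysql can bind them as parameters instead, which sidesteps quoting bugs and injection; a sketch of the same upsert, assuming the files schema from setup_db and a pymysql connection as db:

    import pymysql

    def set_last_file_dl(db, file_id, ch_date):
        # same upsert as above; pymysql escapes the %s parameters itself
        query = ("INSERT INTO files (`id`, `ch_date`) VALUES (%s, %s) "
                 "ON DUPLICATE KEY UPDATE `ch_date` = %s")
        with db.cursor() as crs:
            crs.execute(query, (file_id, ch_date, ch_date))
        db.commit()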

@@ -1,9 +1,12 @@
 #!/bin/env python3
 import os
+import sys
 import argparse
+import logging as log
-import studip
-import mysql
+from studip import Studip
+from crawler import Crawler
+from mysql import Database

 parser = argparse.ArgumentParser(description='Download Files from StudIP.')
@@ -16,32 +19,49 @@ parser.add_argument('-p', '--passwd', type=str,
 parser.add_argument('-s', '--url', type=str, help='studip url', required=True)
 parser.add_argument('--chunk', type=int, default=1024 *
                     1024, help='chunksize for downloading data')
-parser.add_argument('-r', '--reset_dl_date', action='store_true', help='downloads everything and ignores last download date')
+parser.add_argument('-r', '--reset_dl_date', action='store_true',
+                    help='downloads everything and ignores last download date')
 parser.add_argument('--host', type=str, default='localhost', help='mysql host')
 parser.add_argument('--port', type=int, default=3306, help='mysql port')
-parser.add_argument('--db_name', type=str, default='studip', help='mysql database name')
-parser.add_argument('--db_user', type=str, default='root', help='mysql database user')
-parser.add_argument('--db_passwd', type=str, default='secret-pw', help='mysql database password')
+parser.add_argument('--db_name', type=str, default='studip',
+                    help='mysql database name')
+parser.add_argument('--db_user', type=str, default='root',
+                    help='mysql database user')
+parser.add_argument('--db_passwd', type=str,
+                    default='secret-pw', help='mysql database password')
+parser.add_argument('-d', '--debug_output', action='store_true',
+                    help='display debug information about the process')
+parser.add_argument('-q', '--quiet', action='store_true',
+                    help='only display most important output')
+parser.add_argument('-l', '--log_file', action='store_true',
+                    help='saves log to a log file named "log.txt"')

 args = parser.parse_args()

+if args.quiet:
+    log_level = log.WARNING
+elif args.debug_output:
+    log_level = log.DEBUG
+else:
+    log_level = log.INFO
+
+if args.log_file:
+    log.basicConfig(level=log_level, filename='log.txt')
+else:
+    log.basicConfig(level=log_level)
+
 BASE_DIR = os.path.abspath(args.output)
 USERNAME = args.user
 PASSWORD = args.passwd

-db = mysql.Database()
-db.HOST = args.host
-db.PORT = args.port
-db.DB_NAME = args.db_name
-db.USER = args.db_user
-db.PASSW = args.db_passwd
-db.RESET_DL_DATE = args.reset_dl_date
-db.setup_db()
+db = Database(args.host, args.port, args.db_name,
+              args.db_user, args.db_passwd, args.reset_dl_date)

-crwlr = studip.Crawler(db)
+studip = Studip(args.chunk, args.url, (USERNAME, PASSWORD), db)
-crwlr.CHUNK_SIZE = args.chunk
-crwlr.STUDIP_DOMAIN = args.url
-crwlr.USER = (USERNAME, PASSWORD)
+crawler = Crawler(studip)

-crwlr.download_curr_courses(BASE_DIR)
+# Start crawling
+try:
+    crawler.download_curr_courses(BASE_DIR)
+except KeyboardInterrupt:
+    sys.exit(0)
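One nit on the interrupt handler: sys.exit(0) reports success when the user aborts. The shell convention for a SIGINT death is status 130 (128 + signal 2); a variant, if that distinction matters (crawler and BASE_DIR as defined above):

    import sys

    try:
        crawler.download_curr_courses(BASE_DIR)
    except KeyboardInterrupt:
        sys.exit(130)  # 128 + SIGINT: "interrupted by Ctrl-C" to the shell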

src/studip.py
@@ -1,35 +1,30 @@
 #!/bin/env python3
 import time
-import os
-import argparse
+import logging as log

 from tqdm import tqdm
 import requests as req
 from requests.auth import HTTPBasicAuth


-class Crawler:
+class Studip:
-    def __init__(self, db):
-        self.CHUNK_SIZE = None
-        self.STUDIP_DOMAIN = None
-        self.USER = None
+    def __init__(self, chunk_size, domain, user, db):
+        self.CHUNK_SIZE = chunk_size
+        self.DOMAIN = domain
+        self.USER = user
         self.db = db

-    def create_dir(self, dir):
-        if not os.path.exists(dir):
-            print('creating folder', dir)
-            os.mkdir(dir)
+    def auth_req(self, url):
+        url = self.DOMAIN + url
+        return req.get(url, auth=self.USER)

     def get_uid(self):
-        url = self.STUDIP_DOMAIN + '/api.php/user/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/')
         user_id = rsp.json()['user_id']
         return user_id

     def get_curr_semester(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         curr_time = int(str(int(time.time())))
         semesters = rsp.json()['collection']
         for sem_uri in semesters:
@@ -41,8 +36,7 @@ class Crawler:
         return 0

     def get_ordered_semesters(self):
-        url = self.STUDIP_DOMAIN + '/api.php/semesters/'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/semesters/')
         semesters = rsp.json()['collection']
         order_sems = []
         for sem_uri in semesters:
@@ -50,8 +44,7 @@ class Crawler:
         return order_sems

     def get_curr_courses(self, user_id, semester):
-        url = self.STUDIP_DOMAIN + '/api.php/user/' + user_id + '/courses'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/user/' + user_id + '/courses')
         ord_sems = self.get_ordered_semesters()
         courses = rsp.json()['collection']
         i = 0
@@ -76,15 +69,13 @@ class Crawler:
         return course_list

     def get_top_folder(self, course):
-        url = self.STUDIP_DOMAIN + '/api.php/course/' + course + '/top_folder'
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/course/' + course + '/top_folder')
         top_folder = rsp.json()
         tf_id = top_folder['id']
         return(tf_id)

     def get_docs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         docs = rsp.json()['file_refs']
         res_docs = []
         for doc in docs:
@@ -93,16 +84,14 @@ class Crawler:
         return(res_docs)

     def download(self, doc):
-        url1 = self.STUDIP_DOMAIN + '/api.php/file/' + doc
-        rsp1 = req.get(url1, auth=self.USER)
+        rsp1 = self.auth_req('/api.php/file/' + doc)
         doc_name = rsp1.json()['name']
         doc_chdate = rsp1.json()['chdate']
         last_dl = self.db.get_last_file_dl(doc)
         if last_dl == None or last_dl < doc_chdate:
-            print('downloading ', doc_name)
-            url2 = self.STUDIP_DOMAIN + '/api.php/file/' + doc + '/download'
-            rsp2 = req.get(url2, auth=self.USER, stream=True)
+            rsp2 = self.auth_req('/api.php/file/' + doc + '/download')
             total_size = int(rsp2.headers.get('content-length', 0))
+            print('downloading ' + doc_name)
             progbar = tqdm(total=total_size, unit='iB', unit_scale=True)
             with open(doc_name, 'wb') as doc_file:
                 for chunk in rsp2.iter_content(self.CHUNK_SIZE):
@@ -111,8 +100,7 @@ class Crawler:
             self.db.set_last_file_dl(str(doc), str(int(time.time())))

     def get_subdirs(self, folder):
-        url = self.STUDIP_DOMAIN + '/api.php/folder/' + folder
-        rsp = req.get(url, auth=self.USER)
+        rsp = self.auth_req('/api.php/folder/' + folder)
         subdirs = rsp.json()['subfolders']
         docs = rsp.json()['file_refs']
         res_subdirs = {}
@@ -121,42 +109,3 @@ class Crawler:
             sub_name = subdir['name']
             res_subdirs[sub_id] = sub_name
         return res_subdirs
-
-    def download_folder(self, folder):
-        docs = self.get_docs(folder)
-        for doc in docs:
-            print('found doc ', doc)
-            self.download(doc)
-
-    def download_folder_rec(self, folder, base_dir):
-        print('folder ', folder)
-        self.create_dir(base_dir)
-        self.download_folder(folder)
-        subdirs = self.get_subdirs(folder)
-        os.chdir(base_dir)
-        for subdir in subdirs:
-            subdir_name = subdirs[subdir].replace('/', '-')
-            subdir_path = os.path.join(base_dir, subdir_name)
-            print(subdir_path)
-            self.create_dir(subdir_path)
-            os.chdir(subdir_path)
-            self.download_folder_rec(subdir, subdir_path)
-
-    def download_course(self, course, base_dir):
-        print('course ', course)
-        self.create_dir(base_dir)
-        os.chdir(base_dir)
-        root = self.get_top_folder(course)
-        self.download_folder_rec(root, base_dir)
-
-    def download_curr_courses(self, base_dir):
-        print('Start downloading all current courses')
-        self.create_dir(base_dir)
-        curr_courses = self.get_curr_courses(
-            self.get_uid(), self.get_curr_semester())
-        os.chdir(base_dir)
-        for course in curr_courses:
-            print('course is ', curr_courses[course])
-            course_name = curr_courses[course].replace('/', '-')
-            path = os.path.join(base_dir, course_name)
-            self.download_course(course, path)
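A note on the new auth_req helper: it re-sends credentials and opens a fresh connection on every call, and the refactored download no longer passes stream=True the way the pre-refactor req.get did. A requests.Session variant restores streaming and reuses one connection across the crawler's many API calls; a sketch against the attribute names in the diff:

    import requests as req

    class Studip:
        def __init__(self, chunk_size, domain, user, db):
            self.CHUNK_SIZE = chunk_size
            self.DOMAIN = domain
            self.db = db
            self.session = req.Session()  # one TCP connection reused per host
            self.session.auth = user      # (username, password), as in the diff

        def auth_req(self, url, stream=False):
            # stream=True keeps large file bodies out of memory,
            # as the old download code did
            return self.session.get(self.DOMAIN + url, stream=stream)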
