Add full-text search with a search page

2025-08-08 11:37:48 +02:00 · 2020-06-01 20:51:37 +02:00 · 2020-06-01 20:51:37 +02:00 · 3c49ea6687
commit 3c49ea6687
parent 51682d00f0
17 changed files with 233 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -7,16 +7,22 @@ I however just want to put my markdown files in a directory and get a working wi

 ## Features/To-Dos

- [ ] Plain text support for blog entries
-    - [ ] Markdown Files (.md)
- [ ] Entry page
+- [x] Plain text support for blog entries
+    - [x] Markdown Files (.md)
+- [x] Entry page
+    - [ ] Option to get plain text file
+- [x] Search page
+    - [x] Full-text search
+    - [ ] Show first few lines of each match (description)
 - [ ] Navigation
-    - [ ] Header
-    - [ ] Footer
- [ ] Switchable CSS
-    - [ ] CSS dark-theme
-    - [ ] CSS light-theme
- [ ] Config file
+    - [ ] More advanced namespaces
+    - [x] Header
+        - [ ] Search bar in header
+    - [x] Footer
+- [x] Switchable CSS
+    - [x] CSS dark-theme
+    - [x] CSS light-theme
+- [x] Config file
 - [ ] Docker installation
    - [ ] Enable variables/environment variables
 - [ ] Logo
--- a/src/.gitignore
+++ b/src/.gitignore
@ -1 +1,2 @@
 __pycache__/
+indexdir/
--- a/src/app.py
+++ b/src/app.py
@ -1,21 +1,46 @@
+import os
 from flask import Flask, flash, make_response, render_template, request, redirect, abort

 import content as con_gen
 import config
-
+import search as src
+from forms import SearchForm, csrf

 app = Flask(__name__)

+SECRET_KEY = os.urandom(32)
+app.secret_key = SECRET_KEY
+csrf.init_app(app)
+
+TITLE = config.TITLE
+STYLE = config.STYLE
+

@app.errorhandler(404)
 def page_not_found(e):
-    return render_template('error.html', title=config.TITLE, errorcode='404', style=config.STYLE), 404
+    return render_template('error.html', title=TITLE, errorcode='404', style=STYLE), 404


@app.route('/')
@app.route('/index.html')
 def index():
-    return 'ok'
+    return 'placeholder for index', 200
+
+
+@app.route('/search', methods=['GET', 'POST'])
+def search():
+    """Render the search page and, for a valid submission, the result list.
+
+    GET requests and POSTs that fail form validation (for example an empty
+    query) render the bare search form. A valid submission renders the form
+    together with the HTML produced by ``con_gen.gen_query_res_string``.
+
+    Returns:
+        A ``(body, status)`` tuple; the status is always 200.
+    """
+    form = SearchForm()
+    content = ''
+    # validate_on_submit() is only true for a submitted form whose fields
+    # pass their validators, so a POST without ``query_str`` renders the
+    # empty form instead of failing on ``request.form['query_str']``.
+    if form.validate_on_submit():
+        content = con_gen.gen_query_res_string(form.query_str.data)
+    return render_template('search.html', title=TITLE, style=STYLE, form=form, content=content), 200
+
+
+@app.route('/entry/<path:fullurl>')
+def entry(fullurl):
+    """Render a single entry page.
+
+    Args:
+        fullurl: Path of the entry below the entry directory, taken from
+            the URL.
+
+    Returns:
+        A ``(body, 200)`` tuple for an existing entry. Aborts with 404 when
+        no content could be generated for ``fullurl``.
+    """
+    content = con_gen.gen_stand_string(fullurl)
+    # gen_stand_string() returns '' when the entry file does not exist;
+    # answer with the 404 error page instead of an empty page with 200.
+    if not content:
+        abort(404)
+    return render_template('entry.html', title=TITLE, style=STYLE, content=content), 200


 if __name__ == '__main__':
--- a/src/config.py
+++ b/src/config.py
@ -6,3 +6,8 @@ WEBSITE = 'localhost:5000'

 # Theme for the blog: dark, light
 STYLE = 'dark'
+
+###############################################
+## NO CHANGES RECOMMENDED                    ##
+###############################################
+ENTRY_DIR = 'templates/entry'
--- a/src/content.py
+++ b/src/content.py
@ -1,10 +1,58 @@
+import config
+import search
+
 import datetime
 from datetime import datetime
-import markdown
 import os
 from os import path
 import pathlib

-import config
+import markdown

-ENTRY_DIR = 'templates/entry'
+ENTRY_DIR = config.ENTRY_DIR
+WEBSITE = config.WEBSITE
+
+
+def gen_stand_string(path_ex):
+    """Build the HTML for a standalone entry page.
+
+    The first line of the entry file becomes the page title (``<h1>``). For
+    ``.md`` files the remaining lines are rendered by ``gen_md_content``.
+
+    Args:
+        path_ex: Entry path relative to ``ENTRY_DIR``.
+
+    Returns:
+        The HTML string, or ``''`` if ``path_ex`` does not name a regular
+        file below ``ENTRY_DIR``.
+    """
+    filename = os.path.join(ENTRY_DIR, path_ex)
+    # path_ex is caller-supplied: refuse anything that normalises to a
+    # location outside of ENTRY_DIR (e.g. through '..' segments).
+    root = os.path.abspath(ENTRY_DIR)
+    if not os.path.abspath(filename).startswith(os.path.join(root, '')):
+        return ''
+    # isfile() rather than exists(): a directory must not reach open().
+    if not path.isfile(filename):
+        return ''
+    # Only the title line is needed here; the body is read by
+    # gen_md_content(). The with-block closes the handle again.
+    with open(filename, encoding='utf-8') as entry_file:
+        title = entry_file.readline().rstrip('\n')
+    content_string = '<h1>' + title + '</h1>\n'
+    if filename.endswith('.md'):
+        content_string += gen_md_content(filename, 1)
+    return content_string
+
+
+def gen_md_content(path_ex, depth):
+    """Render the body of a markdown entry to HTML.
+
+    The first line of the file is skipped. Every heading line that starts
+    with one or more ``#`` followed by a space is demoted by ``depth``
+    levels before the text is passed to ``markdown.markdown``. Lines inside
+    fenced code blocks and ``# `` sequences in running text are left alone.
+
+    Args:
+        path_ex: Path of the markdown file.
+        depth: Number of ``#`` characters to add to every heading.
+
+    Returns:
+        The rendered HTML, or ``''`` if ``path_ex`` is not an existing file.
+    """
+    if not path.isfile(path_ex):
+        return ''
+    with open(path_ex, 'r', encoding='utf-8') as md_file:
+        markdown_lines = md_file.readlines()[1:]
+    shifted_lines = []
+    in_fence = False
+    for line in markdown_lines:
+        if line.startswith(('```', '~~~')):
+            # Fence delimiter: '# ' lines inside a code block are comments,
+            # not headings, and must be kept verbatim.
+            in_fence = not in_fence
+        elif not in_fence:
+            text = line.lstrip('#')
+            level = len(line) - len(text)
+            if level and text.startswith(' '):
+                line = '#' * (level + depth) + text
+        shifted_lines.append(line)
+    return markdown.markdown(
+        ''.join(shifted_lines), extensions=["fenced_code", "tables"]
+    )
+
+
+def gen_query_res_string(query_str):
+    """Run a full-text search and render the hits as an HTML list.
+
+    Args:
+        query_str: The query passed on to ``search.search``.
+
+    Returns:
+        An ``<ul>`` element with one ``<li><a href="/entry/...">`` per hit;
+        the list is empty when nothing matched.
+    """
+    # Function-scope imports keep this block self-contained.
+    from html import escape
+    from urllib.parse import quote
+
+    items = []
+    for result in search.search(query_str):
+        # Hits carry the file path including ENTRY_DIR; the entry route
+        # expects the part below it, with '/' separators. relpath() does
+        # not depend on how many components ENTRY_DIR has.
+        rel_path = os.path.relpath(result['path'], ENTRY_DIR)
+        url = '/entry/' + quote(rel_path.replace(os.sep, '/'))
+        # Titles are plain text read from the entry files: escape them so
+        # characters such as '<' or '&' cannot break the markup.
+        title = escape(result['title'].strip())
+        items.append('<li><a href="' + url + '">' + title + '</a></li>')
+    return '<ul>\n' + ''.join(items) + '</ul>\n'
--- a/src/forms.py
+++ b/src/forms.py
@ -0,0 +1,11 @@
+from flask_wtf import CSRFProtect
+from flask_wtf import FlaskForm
+from wtforms import StringField, SubmitField, TextField, ValidationError, validators
+
+csrf = CSRFProtect()
+
+
+class SearchForm(FlaskForm):
+    """Search form with a required query field and a submit button."""
+
+    # StringField / DataRequired are the current WTForms names; TextField
+    # and validators.Required are deprecated aliases of them.
+    query_str = StringField(
+        "Query", [validators.DataRequired("Please enter the search term")])
+    submit = SubmitField("Send")
--- a/src/requirements.txt
+++ b/src/requirements.txt
@ -1,2 +1,5 @@
-Flask==1.1.2
 Markdown==3.1.1
+WTForms==2.2.1
+Flask==1.1.2
+Flask_WTF==0.14.3
+Whoosh==2.7.4
--- a/src/search.py
+++ b/src/search.py
@ -0,0 +1,55 @@
+import config
+
+import os
+import sys
+
+from whoosh import scoring
+from whoosh.index import create_in, open_dir
+from whoosh.fields import Schema, TEXT, ID
+from whoosh.qparser import QueryParser
+
+# Directory (relative to the working directory) that holds the Whoosh index.
+INDEX_DIR = "indexdir"
+# Default maximum number of hits returned by search().
+DEF_TOPN = 10
+# Root directory that is walked for files to index.
+ENTRY_DIR = config.ENTRY_DIR
+
+
+def createSearchableData(root):
+    '''
+    Build the full-text index in INDEX_DIR from every file below ``root``.
+
+    Schema definition: title (first line of the file, stored), path (file
+    path as ID, stored), content (whole file text, indexed but not stored).
+    Files that cannot be decoded as UTF-8 text are skipped.
+    source:
+    https://appliedmachinelearning.blog/2018/07/31/developing-a-fast-indexing-and-full-text-search-engine-with-whoosh-a-pure-pythhon-library/
+    '''
+    schema = Schema(title=TEXT(stored=True),
+                    path=ID(stored=True), content=TEXT)
+    # makedirs(exist_ok=True) avoids the check-then-create race of
+    # exists() followed by mkdir().
+    os.makedirs(INDEX_DIR, exist_ok=True)
+    ix = create_in(INDEX_DIR, schema)
+    # Used as a context manager the writer commits on success and cancels
+    # (releasing its lock) if indexing raises.
+    with ix.writer() as writer:
+        for dirpath, _dirnames, filenames in os.walk(root):
+            for name in filenames:
+                path = os.path.join(dirpath, name)
+                try:
+                    with open(path, encoding='utf-8') as fp:
+                        first_line = fp.readline()
+                        text = first_line + fp.read()
+                except UnicodeDecodeError:
+                    # Not a text file (e.g. an image stored next to the
+                    # entries): skip it instead of aborting the build.
+                    continue
+                # Store the title without its trailing newline; the full
+                # text (including the title line) stays searchable.
+                writer.add_document(
+                    title=first_line.strip(), path=path, content=text)
+
+
+def search_times(query_str, topN):
+    """Return at most ``topN`` hits for ``query_str`` from the index.
+
+    The query is parsed against the ``content`` field and scored with
+    BM25F. Each hit is a dict with the keys ``title``, ``path`` and
+    ``match`` (the hit's score).
+    """
+    index = open_dir(INDEX_DIR)
+    with index.searcher(weighting=scoring.BM25F) as searcher:
+        parsed = QueryParser("content", index.schema).parse(query_str)
+        hits = searcher.search(parsed, limit=topN)
+        # Build the plain-dict list while the searcher is still open.
+        return [
+            {'title': hit['title'], 'path': hit['path'], 'match': hit.score}
+            for hit in hits
+        ]
+
+
+def search(query_str):
+    """Search the index for ``query_str`` using the default limit DEF_TOPN."""
+    return search_times(query_str, topN=DEF_TOPN)
+
+
+# Runs on import: (re)builds the index for ENTRY_DIR each time this module
+# is first loaded.
+createSearchableData(ENTRY_DIR)
--- a/src/templates/entry.html
+++ b/src/templates/entry.html
@ -0,0 +1,10 @@
+{% extends "template.html" %}
+{% block content %}
+<div class="container">
+    <div class="content">
+        {% autoescape off %}
+	<span>{{ content }}</span>
+        {% endautoescape %}
+    </div>
+</div>
+{% endblock %}
--- a/src/templates/entry/namespace/test-entry4.md
+++ b/src/templates/entry/namespace/test-entry4.md
@ -0,0 +1,11 @@
+Test Entry Title 4
+This is a markdown file
+[link to entry3](../test-entry3.md)
+
+- list entry
+- list entry
+- list entry
+
+# md-header
+
+more content
--- a/src/templates/entry/namespace/test-entry5.md
+++ b/src/templates/entry/namespace/test-entry5.md
@ -0,0 +1,10 @@
+Test Entry Title 5
+This is a markdown file
+
+- list entry
+- list entry
+- list entry
+
+# md-header
+
+more content
--- a/src/templates/entry/namespace/test-entry6.md
+++ b/src/templates/entry/namespace/test-entry6.md
@ -0,0 +1,10 @@
+Test Entry Title 6
+This is a markdown file
+
+- list entry
+- list entry
+- list entry
+
+# md-header
+
+more content
--- a/src/templates/entry/test-entry1.md
+++ b/src/templates/entry/test-entry1.md
@ -1,5 +1,6 @@
-Test Entry Title 3
+Test Entry Title 1
 This is a markdown file
+This text contains a one.

 - list entry
 - list entry
--- a/src/templates/entry/test-entry2.md
+++ b/src/templates/entry/test-entry2.md
@ -1,5 +1,6 @@
-Test Entry Title 3
+Test Entry Title 2
 This is a markdown file
+Two Two

 - list entry
 - list entry
--- a/src/templates/entry/test-entry3.md
+++ b/src/templates/entry/test-entry3.md
@ -1,4 +1,4 @@
-Test Entry Title 3
+Test Entry Title 3 Three
 This is a markdown file

 - list entry
--- a/src/templates/search.html
+++ b/src/templates/search.html
@ -0,0 +1,15 @@
+{% extends "template.html" %}
+{% block content %}
+<div class="container">
+    <div class="content">
+        <form action="{{ url_for('search') }}" method=post>
+            {{ form.hidden_tag() }}
+            {{ form.query_str }}
+            {{ form.submit }}
+        </form>
+        {% autoescape off %}
+	<span>{{ content }}</span>
+        {% endautoescape %}
+    </div>
+</div>
+{% endblock %}
--- a/src/templates/template.html
+++ b/src/templates/template.html
@ -14,6 +14,7 @@
        <label for="main-menu-check" class="show-menu">&#9776;</label>
        <div class="main-menu">
            <a href="/">Startpage</a>
+            <a href="/search">Search</a>
            <label for="main-menu-check" class="hide-menu">X</label>
        </div>
    </div>
@ -24,7 +25,7 @@
    <!-- Content -->
    <footer>
        <div class="center">
-            Dieser Blog enthält kein Javascript oder PHP.<br>
+            Dieses Wiki enthält kein Javascript oder PHP.<br>
            Dies ist eine Instanz vom <a href="https://github.com/tiyn/tiyny-wiki">Tiyny-Wiki</a>.
        </div>
    </footer>