add further comments

2022-05-28 19:39:24 +02:00 · 2022-05-28 19:39:24 +02:00 · cee2d548c6
commit cee2d548c6
parent 4bfe48f20f
1 changed file with 6 additions and 2 deletions
--- a/tagger.py
+++ b/tagger.py
@@ -7,12 +7,14 @@ from urllib.parse import unquote_plus

 from exclude import EXCLUDED_WORDS

+# Einstellungen
 SOURCE_DIR = Path('data')
 SOURCE_FILENAME = 'index.txt'
 OUTPUT_FILE = 'tags.json'
 TAGS_PER_ARTICLE = 5
 JSON_INDENT = 2

+# Wegen Performance vordefinierte Variablen
 _UPPER_CHECK = re.compile(r'[A-Z]')


@@ -41,9 +43,11 @@ class FileScanner(HTMLParser):
            text = unquote_plus(text)
            # Textteile in einzelne Wörter aufteilen
            words += re.split(r'[ /\-_#\n.?=]', text)
-        print(f'\nFile {self.file.parent.name} contains {len(words)} words')
+        # Die Anzahl der Wörter in der aktuellen Datei auf der Konsole ausgeben
+        title = self.file.parent.name
+        print(f'\nFile {title} contains {len(words)} words')
        # Titel in einzelne Wörter aufteilen
-        title_words = set(self.file.parent.name.split('-'))
+        title_words = set(title.split('-'))
        for word in words:
            # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
            tag_name = word.strip(".,:;!\"'<>()")