add further comments
This commit is contained in:
parent
4bfe48f20f
commit
cee2d548c6
|
@ -7,12 +7,14 @@ from urllib.parse import unquote_plus
|
||||||
|
|
||||||
from exclude import EXCLUDED_WORDS
|
from exclude import EXCLUDED_WORDS
|
||||||
|
|
||||||
|
# Einstellungen
|
||||||
SOURCE_DIR = Path('data')
|
SOURCE_DIR = Path('data')
|
||||||
SOURCE_FILENAME = 'index.txt'
|
SOURCE_FILENAME = 'index.txt'
|
||||||
OUTPUT_FILE = 'tags.json'
|
OUTPUT_FILE = 'tags.json'
|
||||||
TAGS_PER_ARTICLE = 5
|
TAGS_PER_ARTICLE = 5
|
||||||
JSON_INDENT = 2
|
JSON_INDENT = 2
|
||||||
|
|
||||||
|
# Wegen Performance vordefinierte Variablen
|
||||||
_UPPER_CHECK = re.compile(r'[A-Z]')
|
_UPPER_CHECK = re.compile(r'[A-Z]')
|
||||||
|
|
||||||
|
|
||||||
|
@ -41,9 +43,11 @@ class FileScanner(HTMLParser):
|
||||||
text = unquote_plus(text)
|
text = unquote_plus(text)
|
||||||
# Textteile in einzelne Wörter aufteilen
|
# Textteile in einzelne Wörter aufteilen
|
||||||
words += re.split(r'[ /\-_#\n.?=]', text)
|
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||||
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
|
# Die Anzahl der Wörter in der aktuellen Datei auf der Konsole ausgeben
|
||||||
|
title = self.file.parent.name
|
||||||
|
print(f'\nFile {title} contains {len(words)} words')
|
||||||
# Titel in einzelne Wörter aufteilen
|
# Titel in einzelne Wörter aufteilen
|
||||||
title_words = set(self.file.parent.name.split('-'))
|
title_words = set(title.split('-'))
|
||||||
for word in words:
|
for word in words:
|
||||||
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
||||||
tag_name = word.strip(".,:;!\"'<>()")
|
tag_name = word.strip(".,:;!\"'<>()")
|
||||||
|
|
Loading…
Reference in a new issue