add further comments
This commit is contained in:
parent
4bfe48f20f
commit
cee2d548c6
1 changed files with 6 additions and 2 deletions
|
@ -7,12 +7,14 @@ from urllib.parse import unquote_plus
|
|||
|
||||
from exclude import EXCLUDED_WORDS
|
||||
|
||||
# Einstellungen
|
||||
SOURCE_DIR = Path('data')
|
||||
SOURCE_FILENAME = 'index.txt'
|
||||
OUTPUT_FILE = 'tags.json'
|
||||
TAGS_PER_ARTICLE = 5
|
||||
JSON_INDENT = 2
|
||||
|
||||
# Aus Performance-Gründen vordefinierte Variablen
|
||||
_UPPER_CHECK = re.compile(r'[A-Z]')
|
||||
|
||||
|
||||
|
@ -41,9 +43,11 @@ class FileScanner(HTMLParser):
|
|||
text = unquote_plus(text)
|
||||
# Textteile in einzelne Wörter aufteilen
|
||||
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
|
||||
# Die Anzahl der Wörter in der aktuellen Datei auf der Konsole ausgeben
|
||||
title = self.file.parent.name
|
||||
print(f'\nFile {title} contains {len(words)} words')
|
||||
# Titel in einzelne Wörter aufteilen
|
||||
title_words = set(self.file.parent.name.split('-'))
|
||||
title_words = set(title.split('-'))
|
||||
for word in words:
|
||||
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
||||
tag_name = word.strip(".,:;!\"'<>()")
|
||||
|
|
Loading…
Reference in a new issue