Compare commits
7 commits
6a68caca90
...
88717a1371
Author | SHA1 | Date | |
---|---|---|---|
88717a1371 | |||
e8ac8b49f4 | |||
e62f1f2cf9 | |||
20666e8e4c | |||
9eda3b29f8 | |||
710faa4611 | |||
0c6c621919 |
2 changed files with 63 additions and 16 deletions
46
exclude.py
46
exclude.py
|
@ -10,21 +10,28 @@ EXCLUDED_WORDS = {
|
|||
"aus",
|
||||
"bei",
|
||||
"bis",
|
||||
"chf",
|
||||
"dabei",
|
||||
"dafür",
|
||||
"damit",
|
||||
"dank",
|
||||
"das",
|
||||
"dass",
|
||||
"davon",
|
||||
"deine",
|
||||
"deiner",
|
||||
"dem",
|
||||
"den",
|
||||
"der",
|
||||
"des",
|
||||
"die",
|
||||
"diese",
|
||||
"diesem",
|
||||
"dir",
|
||||
"doch",
|
||||
"dort",
|
||||
"du",
|
||||
"eigenen",
|
||||
"ein",
|
||||
"ein",
|
||||
"eine",
|
||||
|
@ -37,13 +44,17 @@ EXCLUDED_WORDS = {
|
|||
"es",
|
||||
"etwas",
|
||||
"euch",
|
||||
"euro",
|
||||
"für",
|
||||
"gibt",
|
||||
"haben",
|
||||
"hat",
|
||||
"heute",
|
||||
"hier",
|
||||
"hinzu",
|
||||
"ich",
|
||||
"ihr",
|
||||
"ihre",
|
||||
"im",
|
||||
"immer",
|
||||
"in",
|
||||
|
@ -51,13 +62,16 @@ EXCLUDED_WORDS = {
|
|||
"ist",
|
||||
"jetzt",
|
||||
"kann",
|
||||
"konnte",
|
||||
"man",
|
||||
"mehr",
|
||||
"mein",
|
||||
"meine",
|
||||
"meiner",
|
||||
"mich",
|
||||
"mir",
|
||||
"mit",
|
||||
"morgen",
|
||||
"nach",
|
||||
"nicht",
|
||||
"noch",
|
||||
|
@ -73,37 +87,47 @@ EXCLUDED_WORDS = {
|
|||
"sind",
|
||||
"um",
|
||||
"und",
|
||||
"uns",
|
||||
"unter",
|
||||
"viel",
|
||||
"viele",
|
||||
"von",
|
||||
"vor",
|
||||
"war",
|
||||
"was",
|
||||
"wenig",
|
||||
"weniger",
|
||||
"wenn",
|
||||
"wer",
|
||||
"werden",
|
||||
"wie",
|
||||
"wieso",
|
||||
"wir",
|
||||
"wird",
|
||||
"wo",
|
||||
"wurde",
|
||||
"wurden",
|
||||
"zu",
|
||||
"zum",
|
||||
"zur",
|
||||
"über",
|
||||
# Englische wörter
|
||||
"and",
|
||||
"about",
|
||||
"and",
|
||||
"default",
|
||||
"i",
|
||||
"in",
|
||||
"more",
|
||||
"much",
|
||||
"no",
|
||||
"of",
|
||||
"or",
|
||||
"that",
|
||||
"the",
|
||||
"this",
|
||||
"to",
|
||||
"will",
|
||||
"yes",
|
||||
"you",
|
||||
"your",
|
||||
# URL Bestandteile
|
||||
"https",
|
||||
"http",
|
||||
"www",
|
||||
"com",
|
||||
"de",
|
||||
"org",
|
||||
"net",
|
||||
"it",
|
||||
"ch",
|
||||
}
|
||||
|
|
33
tagger.py
33
tagger.py
|
@ -13,9 +13,11 @@ SOURCE_FILENAME = 'index.txt'
|
|||
OUTPUT_FILE = 'tags.json'
TAGS_PER_ARTICLE = 5
JSON_INDENT = 2
# HTML elements whose text content is skipped when collecting words.
EXCLUDED_HTML_TAGS = {'code'}

# Patterns compiled once at import time for performance.
_UPPER_CHECK = re.compile(r'[A-Z]')
# Matches an http(s) URL up to the next whitespace character.
_LINK_PATTERN = re.compile(r'https?://\S+')
|
||||
|
||||
|
||||
@dataclass
|
||||
|
@ -29,6 +31,8 @@ class FileScanner(HTMLParser):
|
|||
super().__init__()
|
||||
self.file = file
|
||||
self.texte = []
|
||||
self.links = []
|
||||
self._current_html_tag = None
|
||||
|
||||
def scan_file(self):
|
||||
# Datei einlesen
|
||||
|
@ -39,10 +43,8 @@ class FileScanner(HTMLParser):
|
|||
words_with_usage = {}
|
||||
words = []
|
||||
for text in self.texte:
|
||||
# Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)
|
||||
text = unquote_plus(text)
|
||||
# Textteile in einzelne Wörter aufteilen
|
||||
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||
words += re.split(r'[ \n/]', text)
|
||||
# Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
|
||||
title = self.file.parent.name
|
||||
print(f'\nFile {title} contains {len(words)} words')
|
||||
|
@ -50,7 +52,7 @@ class FileScanner(HTMLParser):
|
|||
title_words = set(title.split('-'))
|
||||
for word in words:
|
||||
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
||||
tag_name = word.strip(".,:;!\"'<>()")
|
||||
tag_name = word.strip(".,:;!?\"'()«»")
|
||||
# Leere Wörter ignorieren
|
||||
if not tag_name:
|
||||
continue
|
||||
|
@ -82,21 +84,42 @@ class FileScanner(HTMLParser):
|
|||
words_with_usage[word] = Tag(name=tag_name, score=score)
|
||||
else:
|
||||
words_with_usage[word].score += score
|
||||
|
||||
link_words = []
|
||||
for link in self.links:
|
||||
# Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /)
|
||||
link = unquote_plus(link)
|
||||
# Link-Teile in einzelne Wörter aufteilen
|
||||
words += re.split(r'[/\-_#.?&=]', link)
|
||||
for link_word in link_words:
|
||||
link_word = link_word.lower()
|
||||
if link_word in words_with_usage:
|
||||
words_with_usage[link_word] += 10
|
||||
|
||||
# Die Wörter nach ihrer Bewertung sortieren
|
||||
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
    """Remember the element being entered and collect ``<a>`` link targets.

    Args:
        tag: Name of the HTML element that was opened.
        attrs: ``(name, value)`` pairs of the element's attributes.
    """
    # Remembered so that handle_data can skip the text of excluded elements.
    self._current_html_tag = tag

    # Only the 'href' attribute of <a> elements is of interest.
    if tag != "a":
        return

    for attr_name, attr_value in attrs:
        if attr_name == "href":
            # Link targets are kept in self.links, separate from the visible
            # text in self.texte. html.parser reports a value-less attribute
            # (``<a href>``) as None; such empty targets are not stored.
            if attr_value:
                self.links.append(attr_value)
            # Only the first href of an element is considered.
            break
|
||||
|
||||
def handle_data(self, data):
    """Collect the text inside an HTML element, splitting out embedded URLs.

    Args:
        data: Text content reported by the parser.
    """
    # Text of excluded elements (see EXCLUDED_HTML_TAGS) is ignored.
    # NOTE(review): the current tag is only updated in handle_starttag in the
    # code visible here; if there is no handle_endtag, text that follows a
    # closed excluded element is skipped until the next start tag - confirm.
    if self._current_html_tag in EXCLUDED_HTML_TAGS:
        return

    # Every URL in the text is handed to _link_result, which stores it in
    # self.links and removes it from the text that is kept.
    text_without_links = _LINK_PATTERN.sub(self._link_result, data)
    self.texte.append(text_without_links)
|
||||
|
||||
def _link_result(self, link_match):
|
||||
self.links.append(link_match.group(0))
|
||||
return ''
|
||||
|
||||
|
||||
def display_tags(tags, min_score):
|
||||
|
|
Loading…
Reference in a new issue