Compare commits
No commits in common. "88717a137148384da5c4357c92740cc14d8a4cc7" and "6a68caca90abe999fca6f6f275442137a193c527" have entirely different histories.
88717a1371
...
6a68caca90
46
exclude.py
46
exclude.py
|
@ -10,28 +10,21 @@ EXCLUDED_WORDS = {
|
||||||
"aus",
|
"aus",
|
||||||
"bei",
|
"bei",
|
||||||
"bis",
|
"bis",
|
||||||
"chf",
|
|
||||||
"dabei",
|
"dabei",
|
||||||
"dafür",
|
"dafür",
|
||||||
"damit",
|
"damit",
|
||||||
"dank",
|
|
||||||
"das",
|
"das",
|
||||||
"dass",
|
"dass",
|
||||||
"davon",
|
"davon",
|
||||||
"deine",
|
|
||||||
"deiner",
|
|
||||||
"dem",
|
"dem",
|
||||||
"den",
|
"den",
|
||||||
"der",
|
"der",
|
||||||
"des",
|
"des",
|
||||||
"die",
|
"die",
|
||||||
"diese",
|
"diese",
|
||||||
"diesem",
|
|
||||||
"dir",
|
"dir",
|
||||||
"doch",
|
|
||||||
"dort",
|
"dort",
|
||||||
"du",
|
"du",
|
||||||
"eigenen",
|
|
||||||
"ein",
|
"ein",
|
||||||
"ein",
|
"ein",
|
||||||
"eine",
|
"eine",
|
||||||
|
@ -44,17 +37,13 @@ EXCLUDED_WORDS = {
|
||||||
"es",
|
"es",
|
||||||
"etwas",
|
"etwas",
|
||||||
"euch",
|
"euch",
|
||||||
"euro",
|
|
||||||
"für",
|
"für",
|
||||||
"gibt",
|
"gibt",
|
||||||
"haben",
|
"haben",
|
||||||
"hat",
|
"hat",
|
||||||
"heute",
|
|
||||||
"hier",
|
"hier",
|
||||||
"hinzu",
|
|
||||||
"ich",
|
"ich",
|
||||||
"ihr",
|
"ihr",
|
||||||
"ihre",
|
|
||||||
"im",
|
"im",
|
||||||
"immer",
|
"immer",
|
||||||
"in",
|
"in",
|
||||||
|
@ -62,16 +51,13 @@ EXCLUDED_WORDS = {
|
||||||
"ist",
|
"ist",
|
||||||
"jetzt",
|
"jetzt",
|
||||||
"kann",
|
"kann",
|
||||||
"konnte",
|
|
||||||
"man",
|
"man",
|
||||||
"mehr",
|
"mehr",
|
||||||
"mein",
|
"mein",
|
||||||
"meine",
|
"meine",
|
||||||
"meiner",
|
|
||||||
"mich",
|
"mich",
|
||||||
"mir",
|
"mir",
|
||||||
"mit",
|
"mit",
|
||||||
"morgen",
|
|
||||||
"nach",
|
"nach",
|
||||||
"nicht",
|
"nicht",
|
||||||
"noch",
|
"noch",
|
||||||
|
@ -87,47 +73,37 @@ EXCLUDED_WORDS = {
|
||||||
"sind",
|
"sind",
|
||||||
"um",
|
"um",
|
||||||
"und",
|
"und",
|
||||||
"uns",
|
|
||||||
"unter",
|
|
||||||
"viel",
|
|
||||||
"viele",
|
"viele",
|
||||||
"von",
|
"von",
|
||||||
"vor",
|
"vor",
|
||||||
"war",
|
"war",
|
||||||
"was",
|
|
||||||
"wenig",
|
|
||||||
"weniger",
|
|
||||||
"wenn",
|
"wenn",
|
||||||
"wer",
|
|
||||||
"werden",
|
"werden",
|
||||||
"wie",
|
"wie",
|
||||||
"wieso",
|
|
||||||
"wir",
|
"wir",
|
||||||
"wird",
|
"wird",
|
||||||
"wo",
|
|
||||||
"wurde",
|
|
||||||
"wurden",
|
|
||||||
"zu",
|
"zu",
|
||||||
"zum",
|
"zum",
|
||||||
"zur",
|
"zur",
|
||||||
"über",
|
"über",
|
||||||
# Englische wörter
|
# Englische wörter
|
||||||
"about",
|
|
||||||
"and",
|
"and",
|
||||||
"default",
|
"about",
|
||||||
"i",
|
|
||||||
"in",
|
"in",
|
||||||
"more",
|
"more",
|
||||||
"much",
|
"much",
|
||||||
"no",
|
|
||||||
"of",
|
"of",
|
||||||
"or",
|
"or",
|
||||||
"that",
|
|
||||||
"the",
|
|
||||||
"this",
|
"this",
|
||||||
"to",
|
|
||||||
"will",
|
|
||||||
"yes",
|
|
||||||
"you",
|
|
||||||
"your",
|
"your",
|
||||||
|
# URL Bestandteile
|
||||||
|
"https",
|
||||||
|
"http",
|
||||||
|
"www",
|
||||||
|
"com",
|
||||||
|
"de",
|
||||||
|
"org",
|
||||||
|
"net",
|
||||||
|
"it",
|
||||||
|
"ch",
|
||||||
}
|
}
|
||||||
|
|
33
tagger.py
33
tagger.py
|
@ -13,11 +13,9 @@ SOURCE_FILENAME = 'index.txt'
|
||||||
OUTPUT_FILE = 'tags.json'
|
OUTPUT_FILE = 'tags.json'
|
||||||
TAGS_PER_ARTICLE = 5
|
TAGS_PER_ARTICLE = 5
|
||||||
JSON_INDENT = 2
|
JSON_INDENT = 2
|
||||||
EXCLUDED_HTML_TAGS = {'code'}
|
|
||||||
|
|
||||||
# Wegen Performance vordefinierte Variablen
|
# Wegen Performance vordefinierte Variablen
|
||||||
_UPPER_CHECK = re.compile(r'[A-Z]')
|
_UPPER_CHECK = re.compile(r'[A-Z]')
|
||||||
_LINK_PATTERN = re.compile(r'https?://\S+')
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@ -31,8 +29,6 @@ class FileScanner(HTMLParser):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.file = file
|
self.file = file
|
||||||
self.texte = []
|
self.texte = []
|
||||||
self.links = []
|
|
||||||
self._current_html_tag = None
|
|
||||||
|
|
||||||
def scan_file(self):
|
def scan_file(self):
|
||||||
# Datei einlesen
|
# Datei einlesen
|
||||||
|
@ -43,8 +39,10 @@ class FileScanner(HTMLParser):
|
||||||
words_with_usage = {}
|
words_with_usage = {}
|
||||||
words = []
|
words = []
|
||||||
for text in self.texte:
|
for text in self.texte:
|
||||||
|
# Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)
|
||||||
|
text = unquote_plus(text)
|
||||||
# Textteile in einzelne Wörter aufteilen
|
# Textteile in einzelne Wörter aufteilen
|
||||||
words += re.split(r'[ \n/]', text)
|
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||||
# Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
|
# Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
|
||||||
title = self.file.parent.name
|
title = self.file.parent.name
|
||||||
print(f'\nFile {title} contains {len(words)} words')
|
print(f'\nFile {title} contains {len(words)} words')
|
||||||
|
@ -52,7 +50,7 @@ class FileScanner(HTMLParser):
|
||||||
title_words = set(title.split('-'))
|
title_words = set(title.split('-'))
|
||||||
for word in words:
|
for word in words:
|
||||||
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
# Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
|
||||||
tag_name = word.strip(".,:;!?\"'()«»")
|
tag_name = word.strip(".,:;!\"'<>()")
|
||||||
# Leere Wörter ignorieren
|
# Leere Wörter ignorieren
|
||||||
if not tag_name:
|
if not tag_name:
|
||||||
continue
|
continue
|
||||||
|
@ -84,42 +82,21 @@ class FileScanner(HTMLParser):
|
||||||
words_with_usage[word] = Tag(name=tag_name, score=score)
|
words_with_usage[word] = Tag(name=tag_name, score=score)
|
||||||
else:
|
else:
|
||||||
words_with_usage[word].score += score
|
words_with_usage[word].score += score
|
||||||
|
|
||||||
link_words = []
|
|
||||||
for link in self.links:
|
|
||||||
# Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /)
|
|
||||||
link = unquote_plus(link)
|
|
||||||
# Link-Teile in einzelne Wörter aufteilen
|
|
||||||
words += re.split(r'[/\-_#.?&=]', link)
|
|
||||||
for link_word in link_words:
|
|
||||||
link_word = link_word.lower()
|
|
||||||
if link_word in words_with_usage:
|
|
||||||
words_with_usage[link_word] += 10
|
|
||||||
|
|
||||||
# Die Wörter nach ihrer Bewertung sortieren
|
# Die Wörter nach ihrer Bewertung sortieren
|
||||||
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
||||||
|
|
||||||
def handle_starttag(self, tag, attrs):
|
def handle_starttag(self, tag, attrs):
|
||||||
self._current_html_tag = tag
|
|
||||||
# Die Links, die in den 'href' Attributen eines <a> HTML-Elements stehen, mit einbeziehen.
|
# Die Links, die in den 'href' Attributen eines <a> HTML-Elements stehen, mit einbeziehen.
|
||||||
if tag != "a":
|
if tag != "a":
|
||||||
return
|
return
|
||||||
for attr_name, attr_value in attrs:
|
for attr_name, attr_value in attrs:
|
||||||
if attr_name == "href":
|
if attr_name == "href":
|
||||||
self.links.append(attr_value)
|
self.texte.append(attr_value)
|
||||||
break
|
break
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
# Den Text innerhalb eines HTML-Elements mit einbeziehen.
|
# Den Text innerhalb eines HTML-Elements mit einbeziehen.
|
||||||
if self._current_html_tag in EXCLUDED_HTML_TAGS:
|
|
||||||
return
|
|
||||||
|
|
||||||
data = _LINK_PATTERN.sub(self._link_result, data)
|
|
||||||
self.texte.append(data)
|
self.texte.append(data)
|
||||||
|
|
||||||
def _link_result(self, link_match):
|
|
||||||
self.links.append(link_match.group(0))
|
|
||||||
return ''
|
|
||||||
|
|
||||||
|
|
||||||
def display_tags(tags, min_score):
|
def display_tags(tags, min_score):
|
||||||
|
|
Loading…
Reference in a new issue