diff --git a/exclude.py b/exclude.py
index 438cd39..553c591 100644
--- a/exclude.py
+++ b/exclude.py
@@ -10,21 +10,28 @@ EXCLUDED_WORDS = {
     "aus",
     "bei",
     "bis",
+    "chf",
     "dabei",
     "dafür",
     "damit",
+    "dank",
     "das",
     "dass",
     "davon",
+    "deine",
+    "deiner",
     "dem",
     "den",
     "der",
     "des",
     "die",
     "diese",
+    "diesem",
     "dir",
+    "doch",
     "dort",
     "du",
+    "eigenen",
     "ein",
     "ein",
     "eine",
@@ -37,13 +44,17 @@ EXCLUDED_WORDS = {
     "es",
     "etwas",
     "euch",
+    "euro",
     "für",
     "gibt",
     "haben",
     "hat",
+    "heute",
     "hier",
+    "hinzu",
     "ich",
     "ihr",
+    "ihre",
     "im",
     "immer",
     "in",
@@ -51,13 +62,16 @@ EXCLUDED_WORDS = {
     "ist",
     "jetzt",
     "kann",
+    "konnte",
     "man",
     "mehr",
     "mein",
     "meine",
+    "meiner",
     "mich",
     "mir",
     "mit",
+    "morgen",
     "nach",
     "nicht",
     "noch",
@@ -73,37 +87,47 @@ EXCLUDED_WORDS = {
     "sind",
     "um",
     "und",
+    "uns",
+    "unter",
+    "viel",
     "viele",
     "von",
     "vor",
     "war",
+    "was",
+    "wenig",
+    "weniger",
     "wenn",
+    "wer",
     "werden",
     "wie",
+    "wieso",
     "wir",
     "wird",
+    "wo",
+    "wurde",
+    "wurden",
     "zu",
     "zum",
     "zur",
     "über",
     # Englische wörter
-    "and",
     "about",
+    "and",
+    "default",
+    "i",
     "in",
     "more",
     "much",
+    "no",
     "of",
     "or",
+    "that",
+    "the",
     "this",
+    "to",
+    "will",
+    "yes",
+    "you",
     "your",
-    # URL Bestandteile
-    "https",
-    "http",
-    "www",
-    "com",
-    "de",
-    "org",
-    "net",
-    "it",
-    "ch",
 }
diff --git a/tagger.py b/tagger.py
index 437d95c..83e1ec8 100644
--- a/tagger.py
+++ b/tagger.py
@@ -13,9 +13,11 @@ SOURCE_FILENAME = 'index.txt'
 OUTPUT_FILE = 'tags.json'
 TAGS_PER_ARTICLE = 5
 JSON_INDENT = 2
+EXCLUDED_HTML_TAGS = {'code'}
 
 # Wegen Performance vordefinierte Variablen
 _UPPER_CHECK = re.compile(r'[A-Z]')
+_LINK_PATTERN = re.compile(r'https?://\S+')
 
 
 @dataclass
@@ -29,6 +31,8 @@ class FileScanner(HTMLParser):
         super().__init__()
         self.file = file
         self.texte = []
+        self.links = []
+        self._current_html_tag = None
 
     def scan_file(self):
         # Datei einlesen
@@ -39,10 +43,8 @@ class FileScanner(HTMLParser):
         words_with_usage = {}
         words = []
         for text in self.texte:
-            # Eventuelle URL-codierte Zeichen in die eigentliche Zeichen umwandeln. (z.B. %2F -> /)
-            text = unquote_plus(text)
             # Textteile in einzelne Wörter aufteilen
-            words += re.split(r'[ /\-_#\n.?=]', text)
+            words += re.split(r'[ \n/]', text)
         # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben
         title = self.file.parent.name
         print(f'\nFile {title} contains {len(words)} words')
@@ -50,7 +52,7 @@ class FileScanner(HTMLParser):
         title_words = set(title.split('-'))
         for word in words:
             # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen.
-            tag_name = word.strip(".,:;!\"'<>()")
+            tag_name = word.strip(".,:;!?\"'()«»")
             # Leere Wörter ignorieren
             if not tag_name:
                 continue
@@ -82,21 +84,46 @@ class FileScanner(HTMLParser):
                 words_with_usage[word] = Tag(name=tag_name, score=score)
             else:
                 words_with_usage[word].score += score
+
+        link_words = []
+        for link in self.links:
+            # Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /)
+            link = unquote_plus(link)
+            # Link-Teile in einzelne Wörter aufteilen
+            link_words += re.split(r'[/\-_#.?&=]', link)
+        for link_word in link_words:
+            link_word = link_word.lower()
+            if link_word in words_with_usage:
+                words_with_usage[link_word].score += 10
+
         # Die Wörter nach ihrer Bewertung sortieren
         return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
 
     def handle_starttag(self, tag, attrs):
+        self._current_html_tag = tag
         # Die Links, die in den 'href' Attributen eines HTML-Elements stehen, mit einbeziehen.
         if tag != "a":
            return
         for attr_name, attr_value in attrs:
             if attr_name == "href":
-                self.texte.append(attr_value)
+                self.links.append(attr_value)
                 break
 
+    def handle_endtag(self, tag):
+        # Reset the tracked tag so text after a closing tag is not wrongly excluded.
+        self._current_html_tag = None
+
     def handle_data(self, data):
         # Den Text innerhalb eines HTML-Elements mit einbeziehen.
+        if self._current_html_tag in EXCLUDED_HTML_TAGS:
+            return
+
+        data = _LINK_PATTERN.sub(self._link_result, data)
         self.texte.append(data)
+
+    def _link_result(self, link_match):
+        self.links.append(link_match.group(0))
+        return ''
 
 
 def display_tags(tags, min_score):