diff --git a/exclude.py b/exclude.py index 553c591..438cd39 100644 --- a/exclude.py +++ b/exclude.py @@ -10,28 +10,21 @@ EXCLUDED_WORDS = { "aus", "bei", "bis", - "chf", "dabei", "dafür", "damit", - "dank", "das", "dass", "davon", - "deine", - "deiner", "dem", "den", "der", "des", "die", "diese", - "diesem", "dir", - "doch", "dort", "du", - "eigenen", "ein", "ein", "eine", @@ -44,17 +37,13 @@ EXCLUDED_WORDS = { "es", "etwas", "euch", - "euro", "für", "gibt", "haben", "hat", - "heute", "hier", - "hinzu", "ich", "ihr", - "ihre", "im", "immer", "in", @@ -62,16 +51,13 @@ EXCLUDED_WORDS = { "ist", "jetzt", "kann", - "konnte", "man", "mehr", "mein", "meine", - "meiner", "mich", "mir", "mit", - "morgen", "nach", "nicht", "noch", @@ -87,47 +73,37 @@ EXCLUDED_WORDS = { "sind", "um", "und", - "uns", - "unter", - "viel", "viele", "von", "vor", "war", - "was", - "wenig", - "weniger", "wenn", - "wer", "werden", "wie", - "wieso", "wir", "wird", - "wo", - "wurde", - "wurden", "zu", "zum", "zur", "über", # Englische wörter - "about", "and", - "default", - "i", + "about", "in", "more", "much", - "no", "of", "or", - "that", - "the", "this", - "to", - "will", - "yes", - "you", "your", + # URL Bestandteile + "https", + "http", + "www", + "com", + "de", + "org", + "net", + "it", + "ch", } diff --git a/tagger.py b/tagger.py index 83e1ec8..437d95c 100644 --- a/tagger.py +++ b/tagger.py @@ -13,11 +13,9 @@ SOURCE_FILENAME = 'index.txt' OUTPUT_FILE = 'tags.json' TAGS_PER_ARTICLE = 5 JSON_INDENT = 2 -EXCLUDED_HTML_TAGS = {'code'} # Wegen Performance vordefinierte Variablen _UPPER_CHECK = re.compile(r'[A-Z]') -_LINK_PATTERN = re.compile(r'https?://\S+') @dataclass @@ -31,8 +29,6 @@ class FileScanner(HTMLParser): super().__init__() self.file = file self.texte = [] - self.links = [] - self._current_html_tag = None def scan_file(self): # Datei einlesen @@ -43,8 +39,10 @@ class FileScanner(HTMLParser): words_with_usage = {} words = [] for text in self.texte: + # Eventuelle URL-codierte Zeichen in 
die eigentlichen Zeichen umwandeln. (z.B. %2F -> /) + text = unquote_plus(text) # Textteile in einzelne Wörter aufteilen - words += re.split(r'[ \n/]', text) + words += re.split(r'[ /\-_#\n.?=]', text) # Die Anzahl, der Wörter in der aktuellen Datei, auf der Konsole ausgeben title = self.file.parent.name print(f'\nFile {title} contains {len(words)} words') @@ -52,7 +50,7 @@ class FileScanner(HTMLParser): title_words = set(title.split('-')) for word in words: # Verschiedene Zeichen vom Anfang und Ende der Wörter entfernen. - tag_name = word.strip(".,:;!?\"'()«»") + tag_name = word.strip(".,:;!\"'<>()") # Leere Wörter ignorieren if not tag_name: continue @@ -84,42 +82,21 @@ class FileScanner(HTMLParser): words_with_usage[word] = Tag(name=tag_name, score=score) else: words_with_usage[word].score += score - - link_words = [] - for link in self.links: - # Eventuelle URL-codierte Zeichen in die eigentlichen Zeichen umwandeln. (z.B. %2F -> /) - link = unquote_plus(link) - # Link-Teile in einzelne Wörter aufteilen - words += re.split(r'[/\-_#.?&=]', link) - for link_word in link_words: - link_word = link_word.lower() - if link_word in words_with_usage: - words_with_usage[link_word] += 10 - # Die Wörter nach ihrer Bewertung sortieren return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True) def handle_starttag(self, tag, attrs): - self._current_html_tag = tag # Die Links, die in den 'href' Attributen eines HTML-Elements stehen, mit einbeziehen. if tag != "a": return for attr_name, attr_value in attrs: if attr_name == "href": - self.links.append(attr_value) + self.texte.append(attr_value) break def handle_data(self, data): # Den Text innerhalb eines HTML-Elements mit einbeziehen. - if self._current_html_tag in EXCLUDED_HTML_TAGS: - return - - data = _LINK_PATTERN.sub(self._link_result, data) self.texte.append(data) - - def _link_result(self, link_match): - self.links.append(link_match.group(0)) - return '' def display_tags(tags, min_score):