diff --git a/exclude.py b/exclude.py
new file mode 100644
index 0000000..0be1fed
--- /dev/null
+++ b/exclude.py
@@ -0,0 +1,315 @@
+EXCLUDED_WORDS = {
+    "dir",
+    "die",
+    "das",
+    "wird",
+    "werden",
+    "war",
+    "im",
+    "in",
+    "mit",
+    "ohne",
+    "schade",
+
+
+    "abfahren",
+    "aktivieren",
+    "anbeten",
+    "anhalten",
+    "antreffen",
+    "arbeiten",
+    "ärgern",
+    "aufatmen",
+    "austeilen",
+    "ausstellen",
+    "backen",
+    "baden",
+    "befehlen",
+    "beginnen",
+    "beissen",
+    "bejahen",
+    "belegen",
+    "bestimmen",
+    "bilden",
+    "bluten",
+    "bremsen",
+    "charakterisieren",
+    "chippen",
+    "codieren",
+    "covern",
+    "dableiben",
+    "dagegenhalten",
+    "dahinplätschern",
+    "dämmern",
+    "danken",
+    "decken",
+    "deklinieren",
+    "denken",
+    "deuten",
+    "dienen",
+    "dolmetschen",
+    "drucken",
+    "drücken",
+    "durchgeben",
+    "ehelichen",
+    "eifern",
+    "einbauen",
+    "einfallen",
+    "ekeln",
+    "emporblicken",
+    "entbinden",
+    "entriegeln",
+    "entwickeln",
+    "ergreifen",
+    "erziehen",
+    "essen",
+    "explodieren",
+    "fahren",
+    "fallen",
+    "fällen",
+    "fangen",
+    "fasten",
+    "feilen",
+    "festlegen",
+    "fiebern",
+    "fixieren",
+    "fliessen",
+    "folgen",
+    "fördern",
+    "freuen",
+    "funken",
+    "gackern",
+    "galoppieren",
+    "garantieren",
+    "gebrauchen",
+    "gedenken",
+    "genehmigen",
+    "geniessen",
+    "gleichen",
+    "glühen",
+    "garnieren",
+    "greifen",
+    "gründen",
+    "haben",
+    "hacken",
+    "halten",
+    "handeln",
+    "hassen",
+    "hauen",
+    "heften",
+    "heilen",
+    "herumlaufen",
+    "hoffen",
+    "honorieren",
+    "anzeigen",
+    "idealisieren",
+    "illuminieren",
+    "implizieren",
+    "infiltrieren",
+    "inserieren",
+    "investieren",
+    "irren",
+    "jagen",
+    "jammern",
+    "jauchzen",
+    "joggen",
+    "jubeln",
+    "justieren",
+    "kalkulieren",
+    "kaltmachen",
+    "kämmen",
+    "kämpfen",
+    "kapitulieren",
+    "kegeln",
+    "kellnern",
+    "kichern",
+    "klagen",
+    "klären",
+    "klumpen",
+    "knacken",
+    "konsumieren",
+    "kreisen",
+    "kurieren",
+    "labern",
+    "lachen",
+    "landen",
+    "lassen",
+    "leben",
+    "leeren",
+    "leihen",
+    "lenken",
+    "leuchten",
+    "liefern",
+    "loben",
+    "lohnen",
+    "losziehen",
+    "lüften",
+    "machen",
+    "malen",
+    "manipulieren",
+    "marschieren",
+    "mässigen",
+    "messen",
+    "mindern",
+    "mischen",
+    "mosern",
+    "mühen",
+    "nachbereiten",
+    "nachgucken",
+    "nächtigen",
+    "nähen",
+    "nähren",
+    "neiden",
+    "nerven",
+    "niedermachen",
+    "niesen",
+    "normalisieren",
+    "nötigen",
+    "nutzen",
+    "obsiegen",
+    "öden",
+    "offenbaren",
+    "ökonomisieren",
+    "ölen",
+    "operieren",
+    "ordnen",
+    "orten",
+    "paaren",
+    "pachten",
+    "packen",
+    "parken",
+    "passen",
+    "pauken",
+    "peitschen",
+    "personalisieren",
+    "pfeifen",
+    "pflegen",
+    "picken",
+    "planen",
+    "praktizieren",
+    "proben",
+    "protokollieren",
+    "quadrieren",
+    "quaken",
+    "quälen",
+    "qualifizieren",
+    "qualmen",
+    "quatschen",
+    "quengeln",
+    "querlegen",
+    "quietschen",
+    "quittieren",
+    "radieren",
+    "rahmen",
+    "rangieren",
+    "ranken",
+    "rankommen",
+    "raten",
+    "räumen",
+    "rechnen",
+    "reden",
+    "regeln",
+    "reichen",
+    "reinigen",
+    "reparieren",
+    "respektieren",
+    "rinnen",
+    "rollen",
+    "rosten",
+    "rückkehren",
+    "ruhen",
+    "rutschen",
+    "sabbern",
+    "sagen",
+    "sägen",
+    "salzen",
+    "saugen",
+    "schaben",
+    "schenken",
+    "schiessen",
+    "schlemmen",
+    "schlingern",
+    "schnappen",
+    "schnitzen",
+    "schwärzen",
+    "sehen",
+    "setzen",
+    "sichern",
+    "sprechen",
+    "stehen",
+    "strömen",
+    "studieren",
+    "tafeln",
+    "tagen",
+    "tanken",
+    "tauschen",
+    "teilen",
+    "telefonieren",
+    "testen",
+    "tieferlegen",
+    "tippen",
+    "töten",
+    "träumen",
+    "trinken",
+    "twittern",
+    "üben",
+    "überanstrengen",
+    "überbacken",
+    "umändern",
+    "umhören",
+    "unterbrechen",
+    "unternehmen",
+    "urteilen",
+    "vakuumieren",
+    "variieren",
+    "verabreden",
+    "verallgemeinern",
+    "verbinden",
+    "verderben",
+    "vergeben",
+    "verlangen",
+    "vertragen",
+    "vierteln",
+    "vollziehen",
+    "vorangehen",
+    "vorausahnen",
+    "vorbringen",
+    "voten",
+    "wachen",
+    "wagen",
+    "wählen",
+    "wahren",
+    "wallfahren",
+    "wandern",
+    "wärmen",
+    "wässern",
+    "weggehen",
+    "weichen",
+    "weitermachen",
+    "werben",
+    "wertschätzen",
+    "wichteln",
+    "widersprechen",
+    "wiederholen",
+    "wollen",
+    "wurzeln",
+    "zahlen",
+    "zahnen",
+    "zappeln",
+    "zaubern",
+    "zeichnen",
+    "zelten",
+    "zerdrücken",
+    "zeugen",
+    "ziehen",
+    "zieren",
+    "zischen",
+    "zivilisieren",
+    "zubereiten",
+    "zucken",
+    "zudecken",
+    "zurückweichen",
+    "zusammenleben",
+    "zustossen",
+    "zwingen",
+}
diff --git a/tagger.py b/tagger.py
index e84d0fa..f3889d3 100644
--- a/tagger.py
+++ b/tagger.py
@@ -1,33 +1,86 @@
-from pathlib import Path
 import re
+from dataclasses import dataclass
+from html.parser import HTMLParser
+from pathlib import Path
+
+from exclude import EXCLUDED_WORDS
+
+# Matches capital letters, including the German umlauts that [A-Z] misses.
+upper_check = re.compile(r'[A-ZÄÖÜ]')
 
 
-def scan_file(file: Path):
-    content = read_file(file)
+@dataclass
+class Tag:
+    """A candidate tag: its first-seen spelling and its accumulated score."""
+
+    name: str
+    score: int
+
+
+class FileScanner(HTMLParser):
+    """Collect text and link targets from one HTML file and score the words."""
+
+    def __init__(self, file: Path):
+        super().__init__()
+        self.file = file
+        self.texte = []
 
-    words_with_usage = {}
-    words = re.split(' /-_', content)
-    print(f'File {file.parent.name} contains {len(words)} words')
-    title_words = set(file.name.split('-'))
-    for word in words:
-        word = word.strip(" .,:;-_!\"'<>")
-        score = 10
-        if word in title_words:
-            score *= 4
-        if len(word) <= 3:
-            score //= 2
-        if word in words_with_usage:
-            words_with_usage[word] += score
-        else:
-            words_with_usage[word] = score
-    sorted_list = sorted(words_with_usage.items(), key=lambda item: item[1], reverse=True)
-    display_result(sorted_list)
+    def scan_file(self):
+        """Parse the file, score every word and print the resulting tags."""
+        content = read_file(self.file)
+        self.feed(content)
+
+        words_with_usage = {}
+        words = []
+        for text in self.texte:
+            words += re.split(r'[ /\-_#\n]', text)
+        print(f'\nFile {self.file.parent.name} contains {len(words)} words')
+        # Words are lower-cased below, so the title words must be as well.
+        title_words = set(self.file.parent.name.lower().split('-'))
+        for word in words:
+            tag_name = word.strip(".,:;!\"'<>()")
+            word = tag_name.lower()
+            if not word:
+                continue
+            score = 10
+            if word in EXCLUDED_WORDS:
+                score = 0
+            if word in title_words:
+                score *= 4
+            if len(word) <= 3:
+                score //= 2
+            # An occurrence without any capital letter contributes a score of 0.
+            upper_letters_count = len(upper_check.findall(tag_name))
+            score *= upper_letters_count
+            if word not in words_with_usage:
+                words_with_usage[word] = Tag(name=tag_name, score=score)
+            else:
+                words_with_usage[word].score += score
+        sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=False)
+        display_result(sorted_list)
+
+    def handle_starttag(self, tag, attrs):
+        """Record the href of every anchor tag so link targets are scored too."""
+        if tag != "a":
+            return
+        for attr_name, attr_value in attrs:
+            if attr_name == "href":
+                # A valueless href is reported as None, which re.split rejects.
+                if attr_value:
+                    self.texte.append(attr_value)
+                break
+
+    def handle_data(self, data):
+        """Record every text node handed over by the parser."""
+        self.texte.append(data)
 
 
 def display_result(result):
-    for word, usage in result:
-        if usage > 1:
-            print(f"Score: {usage:>3} Word: {word}")
+    """Print every tag whose score is above 10, in the given order."""
+    for tag in result:
+        if tag.score <= 10:
+            continue
+        print(f"Score: {tag.score:>3} Word: {tag.name}")
 
 
 def read_file(file: Path) -> str:
@@ -36,9 +89,10 @@ def read_file(file: Path) -> str:
 
 
 def main(source=Path('data')):
+    """Scan every index.txt found below *source* and print its scored tags."""
     for file in source.glob('**/index.txt'):
-        scan_file(file)
-        break
+        scanner = FileScanner(file)
+        scanner.scan_file()
 
 
 if __name__ == '__main__':