This commit is contained in:
OneNewDev 2022-05-05 19:30:56 +02:00
parent 54d05d9849
commit a01d67dfa5
2 changed files with 383 additions and 26 deletions

315
exclude.py Normal file
View file

@ -0,0 +1,315 @@
# Lowercase German words that must never be suggested as tags: a few function
# words (articles, prepositions, auxiliary forms) followed by a long, roughly
# alphabetical run of verb infinitives.
#
# The scanner lowercases each word before testing membership in this set and
# gives a matching word a base score of 0, so every entry has to be lowercase.
#
# NOTE(review): several entries are spelled with "ss" instead of "ß"
# ("beissen", "fliessen", "geniessen", "schiessen", "mässigen", "zustossen").
# Text that uses the "ß" spelling will not match them - confirm this is intended.
EXCLUDED_WORDS = {
"dir",
"die",
"das",
"wird",
"werden",
"war",
"im",
"in",
"mit",
"ohne",
"schade",
"abfahren",
"aktivieren",
"anbeten",
"anhalten",
"antreffen",
"arbeiten",
"ärgern",
"aufatmen",
"austeilen",
"ausstellen",
"backen",
"baden",
"befehlen",
"beginnen",
"beissen",
"bejahen",
"belegen",
"bestimmen",
"bilden",
"bluten",
"bremsen",
"charakterisieren",
"chippen",
"codieren",
"covern",
"dableiben",
"dagegenhalten",
"dahinplätschern",
"dämmern",
"danken",
"decken",
"deklinieren",
"denken",
"deuten",
"dienen",
"dolmetschen",
"drucken",
"drücken",
"durchgeben",
"ehelichen",
"eifern",
"einbauen",
"einfallen",
"ekeln",
"emporblicken",
"entbinden",
"entriegeln",
"entwickeln",
"ergreifen",
"erziehen",
"essen",
"explodieren",
"fahren",
"fallen",
"fällen",
"fangen",
"fasten",
"feilen",
"festlegen",
"fiebern",
"fixieren",
"fliessen",
"folgen",
"fördern",
"freuen",
"funken",
"gackern",
"galoppieren",
"garantieren",
"gebrauchen",
"gedenken",
"genehmigen",
"geniessen",
"gleichen",
"glühen",
"garnieren",
"greifen",
"gründen",
"haben",
"hacken",
"halten",
"handeln",
"hassen",
"hauen",
"heften",
"heilen",
"herumlaufen",
"hoffen",
"honorieren",
"anzeigen",
"idealisieren",
"illuminieren",
"implizieren",
"infiltrieren",
"inserieren",
"investieren",
"irren",
"jagen",
"jammern",
"jauchzen",
"joggen",
"jubeln",
"justieren",
"kalkulieren",
"kaltmachen",
"kämmen",
"kämpfen",
"kapitulieren",
"kegeln",
"kellnern",
"kichern",
"klagen",
"klären",
"klumpen",
"knacken",
"konsumieren",
"kreisen",
"kurieren",
"labern",
"lachen",
"landen",
"lassen",
"leben",
"leeren",
"leihen",
"lenken",
"leuchten",
"liefern",
"loben",
"lohnen",
"losziehen",
"lüften",
"machen",
"malen",
"manipulieren",
"marschieren",
"mässigen",
"messen",
"mindern",
"mischen",
"mosern",
"mühen",
"nachbereiten",
"nachgucken",
"nächtigen",
"nähen",
"nähren",
"neiden",
"nerven",
"niedermachen",
"niesen",
"normalisieren",
"nötigen",
"nutzen",
"obsiegen",
"öden",
"offenbaren",
"ökonomisieren",
"ölen",
"operieren",
"ordnen",
"orten",
"paaren",
"pachten",
"packen",
"parken",
"passen",
"pauken",
"peitschen",
"personalisieren",
"pfeifen",
"pflegen",
"picken",
"planen",
"praktizieren",
"proben",
"protokollieren",
"quadrieren",
"quaken",
"quälen",
"qualifizieren",
"qualmen",
"quatschen",
"quengeln",
"querlegen",
"quietschen",
"quittieren",
"radieren",
"rahmen",
"rangieren",
"ranken",
"rankommen",
"raten",
"räumen",
"rechnen",
"reden",
"regeln",
"reichen",
"reinigen",
"reparieren",
"respektieren",
"rinnen",
"rollen",
"rosten",
"rückkehren",
"ruhen",
"rutschen",
"sabbern",
"sagen",
"sägen",
"salzen",
"saugen",
"schaben",
"schenken",
"schiessen",
"schlemmen",
"schlingern",
"schnappen",
"schnitzen",
"schwärzen",
"sehen",
"setzen",
"sichern",
"sprechen",
"stehen",
"strömen",
"studieren",
"tafeln",
"tagen",
"tanken",
"tauschen",
"teilen",
"telefonieren",
"testen",
"tieferlegen",
"tippen",
"töten",
"träumen",
"trinken",
"twittern",
"üben",
"überanstrengen",
"überbacken",
"umändern",
"umhören",
"unterbrechen",
"unternehmen",
"urteilen",
"vakuumieren",
"variieren",
"verabreden",
"verallgemeinern",
"verbinden",
"verderben",
"vergeben",
"verlangen",
"vertragen",
"vierteln",
"vollziehen",
"vorangehen",
"vorausahnen",
"vorbringen",
"voten",
"wachen",
"wagen",
"wählen",
"wahren",
"wallfahren",
"wandern",
"wärmen",
"wässern",
"weggehen",
"weichen",
"weitermachen",
"werben",
"wertschätzen",
"wichteln",
"widersprechen",
"wiederholen",
"wollen",
"wurzeln",
"zahlen",
"zahnen",
"zappeln",
"zaubern",
"zeichnen",
"zelten",
"zerdrücken",
"zeugen",
"ziehen",
"zieren",
"zischen",
"zivilisieren",
"zubereiten",
"zucken",
"zudecken",
"zurückweichen",
"zusammenleben",
"zustossen",
"zwingen",
}

View file

@ -1,33 +1,73 @@
from pathlib import Path
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path
from exclude import EXCLUDED_WORDS
# ASCII-only uppercase matcher. Kept as a module-level name for anything that
# still imports it; FileScanner counts capitals with str.isupper() instead,
# because this character class misses Ä, Ö and Ü.
upper_check = re.compile(r'[A-Z]')


@dataclass
class Tag:
    """A tag candidate: the word as first seen in the text plus its total score."""

    name: str
    score: int


class FileScanner(HTMLParser):
    """Parse one HTML file and score the words it contains as tag candidates.

    While parsing, every text node and the ``href`` value of every ``<a>``
    element is collected in ``self.texte``. ``scan_file`` then splits those
    fragments into words, scores them and hands the result to
    ``display_result``.
    """

    def __init__(self, file: Path):
        """Create a scanner for *file*; nothing is read until ``scan_file`` runs."""
        super().__init__()
        self.file = file
        # Raw fragments gathered by the parser callbacks (text nodes and hrefs).
        self.texte = []

    @staticmethod
    def _count_uppercase(text: str) -> int:
        """Return how many characters of *text* are uppercase (Unicode-aware)."""
        return sum(1 for char in text if char.isupper())

    def scan_file(self):
        """Read the file, score every word and print the result.

        Scoring per occurrence of a word:

        * base score 10, or 0 if the lowercased word is in ``EXCLUDED_WORDS``
        * multiplied by 4 if the word is part of the parent directory name
          (split on ``-``)
        * halved (integer division) if the word has at most 3 characters
        * multiplied by the number of uppercase letters in the word as
          written, so a word without any capital contributes 0

        Occurrences are summed per lowercased word; the spelling of the first
        occurrence becomes the tag name. Tags are passed to
        ``display_result`` in ascending score order.
        """
        content = read_file(self.file)
        self.feed(content)
        # Flush anything the parser is still buffering (e.g. trailing text
        # that ends in an unterminated character reference).
        self.close()

        words = []
        for text in self.texte:
            words += re.split(r'[ /\-_#\n]', text)
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # Words are compared in lowercase, so the title parts must be as well.
        title_words = {part.lower() for part in self.file.parent.name.split('-')}

        words_with_usage = {}
        for raw_word in words:
            tag_name = raw_word.strip(".,:;!\"'<>()")
            word = tag_name.lower()
            if not word:
                continue

            score = 0 if word in EXCLUDED_WORDS else 10
            if word in title_words:
                score *= 4
            if len(word) <= 3:
                score //= 2
            score *= self._count_uppercase(tag_name)

            if word not in words_with_usage:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score

        sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=False)
        display_result(sorted_list)

    def handle_starttag(self, tag, attrs):
        """Collect the ``href`` value of ``<a>`` elements; ignore all other tags."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A valueless attribute (<a href>) is reported as None, which
                # would break the later re.split(); skip empty values.
                if attr_value:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Collect every text node encountered by the parser."""
        self.texte.append(data)
def display_result(result):
    """Print every tag whose score is above 10, in the order given.

    *result* is an iterable of objects with ``score`` and ``name``
    attributes (the ``Tag`` instances produced by the scanner). Tags with a
    score of 10 or less are skipped.
    """
    for tag in result:
        if tag.score <= 10:
            continue
        print(f"Score: {tag.score:>3} Word: {tag.name}")
def read_file(file: Path) -> str:
@ -36,9 +76,11 @@ def read_file(file: Path) -> str:
def main(source=Path('data'), limit=None):
    """Scan every ``index.txt`` found below *source*.

    Args:
        source: Directory that is searched recursively for ``index.txt`` files.
        limit: Optional maximum number of files to scan; ``None`` (default)
            scans all of them. Replaces the former commented-out debug break.
    """
    for index, file in enumerate(source.glob('**/index.txt')):
        if limit is not None and index >= limit:
            break
        scanner = FileScanner(file)
        scanner.scan_file()
if __name__ == '__main__':