commit
This commit is contained in:
parent
54d05d9849
commit
a01d67dfa5
2 changed files with 383 additions and 26 deletions
315
exclude.py
Normal file
315
exclude.py
Normal file
|
@ -0,0 +1,315 @@
|
|||
# Lower-case German words that receive a score of 0 in the tagger and are
# therefore never shown as tag candidates. Lookups are done with the
# lower-cased word, so every entry must be lower case.
EXCLUDED_WORDS = {
    # Function words and fillers.
    # NOTE(review): "dir" next to "die"/"das" may have been meant as "der" —
    # confirm with the author before changing it.
    "dir", "die", "das", "wird", "werden", "war", "im", "in", "mit", "ohne",
    "schade",

    # Verbs (infinitives), grouped by initial letter.
    # A
    "abfahren", "aktivieren", "anbeten", "anhalten", "antreffen", "anzeigen",
    "arbeiten", "ärgern", "aufatmen", "austeilen", "ausstellen",
    # B
    "backen", "baden", "befehlen", "beginnen", "beissen", "bejahen",
    "belegen", "bestimmen", "bilden", "bluten", "bremsen",
    # C
    "charakterisieren", "chippen", "codieren", "covern",
    # D
    "dableiben", "dagegenhalten", "dahinplätschern", "dämmern", "danken",
    "decken", "deklinieren", "denken", "deuten", "dienen", "dolmetschen",
    "drucken", "drücken", "durchgeben",
    # E
    "ehelichen", "eifern", "einbauen", "einfallen", "ekeln", "emporblicken",
    "entbinden", "entriegeln", "entwickeln", "ergreifen", "erziehen",
    "essen", "explodieren",
    # F
    "fahren", "fallen", "fällen", "fangen", "fasten", "feilen", "festlegen",
    "fiebern", "fixieren", "fliessen", "folgen", "fördern", "freuen",
    "funken",
    # G
    "gackern", "galoppieren", "garantieren", "garnieren", "gebrauchen",
    "gedenken", "genehmigen", "geniessen", "gleichen", "glühen", "greifen",
    "gründen",
    # H
    "haben", "hacken", "halten", "handeln", "hassen", "hauen", "heften",
    "heilen", "herumlaufen", "hoffen", "honorieren",
    # I
    "idealisieren", "illuminieren", "implizieren", "infiltrieren",
    "inserieren", "investieren", "irren",
    # J
    "jagen", "jammern", "jauchzen", "joggen", "jubeln", "justieren",
    # K
    "kalkulieren", "kaltmachen", "kämmen", "kämpfen", "kapitulieren",
    "kegeln", "kellnern", "kichern", "klagen", "klären", "klumpen",
    "knacken", "konsumieren", "kreisen", "kurieren",
    # L
    "labern", "lachen", "landen", "lassen", "leben", "leeren", "leihen",
    "lenken", "leuchten", "liefern", "loben", "lohnen", "losziehen",
    "lüften",
    # M
    "machen", "malen", "manipulieren", "marschieren", "mässigen", "messen",
    "mindern", "mischen", "mosern", "mühen",
    # N
    "nachbereiten", "nachgucken", "nächtigen", "nähen", "nähren", "neiden",
    "nerven", "niedermachen", "niesen", "normalisieren", "nötigen", "nutzen",
    # O
    "obsiegen", "öden", "offenbaren", "ökonomisieren", "ölen", "operieren",
    "ordnen", "orten",
    # P
    "paaren", "pachten", "packen", "parken", "passen", "pauken", "peitschen",
    "personalisieren", "pfeifen", "pflegen", "picken", "planen",
    "praktizieren", "proben", "protokollieren",
    # Q
    "quadrieren", "quaken", "quälen", "qualifizieren", "qualmen",
    "quatschen", "quengeln", "querlegen", "quietschen", "quittieren",
    # R
    "radieren", "rahmen", "rangieren", "ranken", "rankommen", "raten",
    "räumen", "rechnen", "reden", "regeln", "reichen", "reinigen",
    "reparieren", "respektieren", "rinnen", "rollen", "rosten", "rückkehren",
    "ruhen", "rutschen",
    # S
    "sabbern", "sagen", "sägen", "salzen", "saugen", "schaben", "schenken",
    "schiessen", "schlemmen", "schlingern", "schnappen", "schnitzen",
    "schwärzen", "sehen", "setzen", "sichern", "sprechen", "stehen",
    "strömen", "studieren",
    # T
    "tafeln", "tagen", "tanken", "tauschen", "teilen", "telefonieren",
    "testen", "tieferlegen", "tippen", "töten", "träumen", "trinken",
    "twittern",
    # U
    "üben", "überanstrengen", "überbacken", "umändern", "umhören",
    "unterbrechen", "unternehmen", "urteilen",
    # V
    "vakuumieren", "variieren", "verabreden", "verallgemeinern", "verbinden",
    "verderben", "vergeben", "verlangen", "vertragen", "vierteln",
    "vollziehen", "vorangehen", "vorausahnen", "vorbringen", "voten",
    # W
    "wachen", "wagen", "wählen", "wahren", "wallfahren", "wandern", "wärmen",
    "wässern", "weggehen", "weichen", "weitermachen", "werben",
    "wertschätzen", "wichteln", "widersprechen", "wiederholen", "wollen",
    "wurzeln",
    # Z
    "zahlen", "zahnen", "zappeln", "zaubern", "zeichnen", "zelten",
    "zerdrücken", "zeugen", "ziehen", "zieren", "zischen", "zivilisieren",
    "zubereiten", "zucken", "zudecken", "zurückweichen", "zusammenleben",
    "zustossen", "zwingen",
}
|
94
tagger.py
94
tagger.py
|
@ -1,33 +1,73 @@
|
|||
from pathlib import Path
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
from exclude import EXCLUDED_WORDS
|
||||
|
||||
# Matches one capital letter. The number of matches in a word is used as a
# score multiplier, so the German capitals Ä, Ö and Ü must be included:
# with a plain [A-Z] a word such as "Ärger" would count zero capitals.
upper_check = re.compile(r'[A-ZÄÖÜ]')
|
||||
|
||||
|
||||
def scan_file(file: Path):
|
||||
content = read_file(file)
|
||||
@dataclass
class Tag:
    """A tag candidate together with its accumulated score."""

    # Spelling under which the candidate is displayed.
    name: str
    # Running total of the scores of all occurrences; mutated in place.
    score: int
|
||||
|
||||
|
||||
class FileScanner(HTMLParser):
    """Collect the text of one file and rank its words as tag candidates.

    While parsing, every text node and the ``href`` value of every ``<a>``
    element is appended to ``self.texte``. :meth:`scan_file` splits these
    snippets into words, scores them and passes the ranking to
    ``display_result``.
    """

    # Characters that separate words inside a collected snippet.
    _WORD_SEPARATORS = re.compile(r'[ /\-_#\n]')
    # Punctuation removed from both ends of a word before scoring.
    _STRIP_CHARS = ".,:;!\"'<>()"

    def __init__(self, file: Path):
        """Create a scanner for *file*; nothing is read until scan_file()."""
        super().__init__()
        self.file = file
        # Text snippets gathered by handle_data() / handle_starttag().
        self.texte = []

    def scan_file(self):
        """Read ``self.file``, score its words and print the ranking.

        Words are counted case-insensitively; the spelling of the first
        occurrence is kept as the displayed tag name. The resulting tags are
        handed to ``display_result`` sorted by ascending score, so the
        strongest candidates come last.
        """
        self.feed(read_file(self.file))
        # feed() may keep trailing text buffered (e.g. an unfinished
        # character reference); close() forces it through handle_data().
        self.close()

        words = []
        for text in self.texte:
            words.extend(self._WORD_SEPARATORS.split(text))
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # The folder name doubles as the title. Lower-case it because the
        # words it is compared against are lower-cased as well.
        title_words = {
            part.lower() for part in self.file.parent.name.split('-')
        }

        words_with_usage = {}
        for raw_word in words:
            tag_name = raw_word.strip(self._STRIP_CHARS)
            word = tag_name.lower()
            if not word:
                continue
            score = self._score_word(tag_name, word, title_words)
            if word not in words_with_usage:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score

        sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score)
        display_result(sorted_list)

    @staticmethod
    def _score_word(tag_name, word, title_words):
        """Return the score of one occurrence of *word* (spelled *tag_name*)."""
        # Excluded words start at 0, so every later factor leaves them at 0.
        score = 0 if word in EXCLUDED_WORDS else 10
        if word in title_words:
            score *= 4
        if len(word) <= 3:
            score //= 2
        # One factor per capital letter matched by upper_check: words written
        # entirely in lower case end up with a score of 0.
        return score * len(upper_check.findall(tag_name))

    def handle_starttag(self, tag, attrs):
        """Collect the ``href`` of ``<a>`` elements; ignore all other tags."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # HTMLParser reports a valueless attribute (``<a href>``) as
                # None, which would later break the word splitting.
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Collect a text node."""
        self.texte.append(data)
|
||||
|
||||
|
||||
def display_result(result, min_score=10):
    """Print every tag whose score is greater than *min_score*.

    Args:
        result: Iterable of objects with ``score`` and ``name`` attributes
            (``Tag`` instances). They are printed in the order given.
        min_score: Tags with this score or less are skipped. The default
            of 10 equals the base score of a single occurrence.
    """
    for tag in result:
        if tag.score > min_score:
            print(f"Score: {tag.score:>3} Word: {tag.name}")
|
||||
|
||||
|
||||
def read_file(file: Path) -> str:
|
||||
|
@ -36,9 +76,11 @@ def read_file(file: Path) -> str:
|
|||
|
||||
|
||||
def main(source=Path('data')):
    """Scan every ``index.txt`` found anywhere below *source*.

    Args:
        source: Directory that is searched recursively. Files are processed
            in sorted path order so the output is the same on every run.
    """
    for file in sorted(source.glob('**/index.txt')):
        FileScanner(file).scan_file()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in a new issue