87 lines
2.3 KiB
Python
87 lines
2.3 KiB
Python
import re
|
|
from dataclasses import dataclass
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
|
|
from exclude import EXCLUDED_WORDS
|
|
|
|
# Matches one ASCII uppercase letter. FileScanner.scan_file counts the matches
# in a word's original spelling to add a per-capital bonus to its score.
upper_check = re.compile(r'[A-Z]')
|
|
|
|
|
|
@dataclass
class Tag:
    """A candidate tag: a word as it should be displayed plus its score."""

    # Spelling of the word used when the tag is printed.
    name: str
    # Accumulated relevance score; callers add to it in place.
    score: int
|
|
|
|
|
|
class FileScanner(HTMLParser):
    """Parse one file as HTML, collect its text and score the words found.

    While the content is fed through the parser, every text node and the
    ``href`` value of every ``<a>`` tag is stored in ``self.texte``.
    ``scan_file`` then splits that material into words, scores each word and
    prints the ranking through ``display_result``.
    """

    def __init__(self, file: Path):
        """Create a scanner for *file*; nothing is read until ``scan_file``."""
        super().__init__()
        self.file = file
        # Raw text fragments (text nodes and link targets) gathered while parsing.
        self.texte = []

    def scan_file(self):
        """Read the file, score every word in it and print the result.

        Scoring of a single occurrence:

        * base score 10, or 0 when the lowercase word is in ``EXCLUDED_WORDS``;
        * multiplied by 4 when the lowercase word is one of the dash-separated
          parts of the parent directory name;
        * halved (integer division) when the word has 3 characters or fewer;
        * plus 5 for each ASCII uppercase letter in the original spelling.

        Occurrences are grouped by lowercase word and their scores are summed;
        the tag keeps the spelling of the first occurrence.
        """
        content = read_file(self.file)
        self.feed(content)
        # feed() may leave incomplete trailing data buffered; close() forces
        # it through so the end of the file is not silently dropped.
        self.close()

        words = []
        for text in self.texte:
            words += re.split(r'[ /\-_#\n]', text)
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        title_words = set(self.file.parent.name.split('-'))
        words_with_usage = {}
        for raw_word in words:
            tag_name = raw_word.strip(".,:;!\"'<>()")
            word = tag_name.lower()
            if not word:
                continue

            score = 10
            if word in EXCLUDED_WORDS:
                score = 0
            if word in title_words:
                score *= 4
            if len(word) <= 3:
                score //= 2
            # NOTE(review): the uppercase bonus is added after the exclusion
            # check, so excluded words that contain capitals still score
            # above zero - confirm this is intended.
            upper_letters_count = len(upper_check.findall(tag_name))
            score += upper_letters_count * 5

            if word not in words_with_usage:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score

        sorted_list = sorted(
            words_with_usage.values(),
            key=lambda tag: tag.score,
            reverse=True,
        )
        display_result(sorted_list)

    def handle_starttag(self, tag, attrs):
        """Store the target of an ``<a>`` tag's first ``href`` attribute."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A valueless attribute (``<a href>``) is reported as None;
                # storing it would make re.split() in scan_file raise.
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Store every text node encountered by the parser."""
        self.texte.append(data)
|
|
|
|
|
|
def display_result(result):
    """Print each tag of *result* whose score is strictly greater than 10.

    Tags are printed in the order given, one per line, with the score
    right-aligned on three columns.
    """
    for entry in result:
        if entry.score > 10:
            print(f"Score: {entry.score:>3} Word: {entry.name}")
|
|
|
|
|
|
def read_file(file: Path) -> str:
    """Return the whole content of *file* as text.

    The file is decoded as UTF-8 explicitly, so the result does not depend
    on the platform's default locale encoding.
    NOTE(review): assumes the scanned files are UTF-8 - confirm against the
    data set.
    """
    # Use a distinct name for the handle so the ``file`` parameter is not shadowed.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read()
|
|
|
|
|
|
def main(source=Path('data')):
    """Scan every ``index.txt`` found at any depth below *source*.

    Each matching file is handed to its own ``FileScanner``, which prints
    the scored words for that file.
    """
    for file in source.glob('**/index.txt'):
        scanner = FileScanner(file)
        scanner.scan_file()
|
|
|
|
|
|
# When executed as a script, scan the default source directory of main().
if __name__ == '__main__':
    main()
|