"""Score the words found in HTML documents and print the best tag candidates."""
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path

from exclude import EXCLUDED_WORDS

# Matches a single ASCII uppercase letter; used to reward capitalised words.
upper_check = re.compile(r'[A-Z]')

# Characters on which collected text and link targets are split into words.
_WORD_SEPARATORS = re.compile(r'[ /\-_#\n]')
# Punctuation trimmed from both ends of every candidate word.
_STRIP_CHARS = ".,:;!\"'<>()"

_BASE_SCORE = 10             # score of one ordinary occurrence
_TITLE_FACTOR = 4            # multiplier for words that appear in the directory name
_SHORT_WORD_MAX_LENGTH = 3   # words up to this length get their score halved
_UPPERCASE_BONUS = 5         # added once per uppercase letter in the word


@dataclass
class Tag:
    """A candidate tag: the first spelling seen and its accumulated score."""

    name: str
    score: int


class FileScanner(HTMLParser):
    """Collect the text and link targets of one HTML file and rank its words.

    The collected text accumulates in ``self.texte`` and is never cleared, so
    an instance is meant to scan its file once.
    """

    def __init__(self, file: Path):
        super().__init__()
        self.file = file
        # Raw text fragments and href values gathered by the parser callbacks.
        self.texte = []

    def scan_file(self):
        """Parse ``self.file``, score every word and print the ranking.

        The name of the file's parent directory, split on ``-``, is treated
        as the title: words that appear in it get a higher score.
        """
        self.feed(read_file(self.file))
        # Flush whatever the parser is still buffering; without close() a
        # trailing fragment (e.g. text ending in a bare '&') is dropped.
        self.close()

        words = []
        for text in self.texte:
            words.extend(_WORD_SEPARATORS.split(text))
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # Words are compared in lower case, so the title words must be too.
        title_words = {
            part.lower() for part in self.file.parent.name.split('-')
        }
        display_result(self._rank_words(words, title_words))

    @staticmethod
    def _rank_words(words, title_words):
        """Return one Tag per distinct word, highest accumulated score first.

        Words are grouped case-insensitively; the Tag keeps the spelling of
        the first occurrence. ``title_words`` must already be lower-cased.
        """
        tags_by_word = {}
        for raw_word in words:
            tag_name = raw_word.strip(_STRIP_CHARS)
            key = tag_name.lower()
            if not key:
                continue

            score = 0 if key in EXCLUDED_WORDS else _BASE_SCORE
            if key in title_words:
                score *= _TITLE_FACTOR
            if len(key) <= _SHORT_WORD_MAX_LENGTH:
                score //= 2
            # NOTE(review): the uppercase bonus also applies to excluded
            # words, so they can still collect points -- confirm intended.
            score += len(upper_check.findall(tag_name)) * _UPPERCASE_BONUS

            existing = tags_by_word.get(key)
            if existing is None:
                tags_by_word[key] = Tag(name=tag_name, score=score)
            else:
                existing.score += score

        return sorted(
            tags_by_word.values(), key=lambda tag: tag.score, reverse=True
        )

    def handle_starttag(self, tag, attrs):
        """Record the target of the first ``href`` attribute of ``<a>`` tags."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A valueless attribute (<a href>) is reported as None, which
                # would later break the word splitting; skip empty targets.
                if attr_value:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Record every piece of character data found in the document."""
        self.texte.append(data)


def display_result(result, min_score=10):
    """Print each tag in ``result`` whose score is strictly above ``min_score``."""
    for tag in result:
        if tag.score <= min_score:
            continue
        print(f"Score: {tag.score:>3} Word: {tag.name}")


def read_file(file: Path) -> str:
    """Return the whole content of ``file``, decoded as UTF-8."""
    # Explicit encoding keeps the result independent of the platform locale.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read()


def main(source=Path('data')):
    """Scan every ``index.txt`` below ``source``, skipping autosave directories."""
    for file in source.glob('**/index.txt'):
        if file.parent.name.startswith('autosave-'):
            continue
        FileScanner(file).scan_file()


if __name__ == '__main__':
    main()