"""Score the words of each article under ``SOURCE_DIR`` and print tag candidates."""

import json
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import unquote_plus

from exclude import EXCLUDED_WORDS

SOURCE_DIR = Path('data')
OUTPUT_FILE = 'tags.json'
TAGS_PER_ARTICLE = 5
JSON_INDENT = 2
UPPER_CHECK = re.compile(r'[A-Z]')

# Characters on which collected text is split into candidate words.
WORD_SEPARATORS = re.compile(r'[ /\-_#\n.?=]')
# Punctuation stripped from both ends of every candidate word.
STRIP_CHARS = ".,:;!\"'<>()"
# Score every accepted word occurrence starts with.
BASE_SCORE = 10


@dataclass
class Tag:
    """A candidate tag together with its accumulated score."""

    name: str  # spelling of the first occurrence (original casing)
    score: int  # sum of the scores of all occurrences


def _score_word(tag_name: str, word: str, title_words) -> int:
    """Return the score of a single occurrence of a word.

    Args:
        tag_name: The word with its original casing.
        word: The lowercased form of ``tag_name``.
        title_words: Lowercased words taken from the article directory name.
    """
    score = BASE_SCORE
    if word in title_words:
        score *= 4
    # Words of three characters or fewer are scaled down by length / 4.
    if len(word) <= 3:
        score = int(score * len(word) / 4)
    # Every ASCII capital letter adds a bonus of 5.
    score += len(UPPER_CHECK.findall(tag_name)) * 5
    return score


class FileScanner(HTMLParser):
    """Collect text and link targets from one file and rank its words."""

    def __init__(self, file: Path):
        super().__init__()
        self.file = file
        # Text nodes and <a href> values gathered while parsing.
        self.texte = []

    def scan_file(self):
        """Parse ``self.file`` and return its tags sorted by descending score.

        Words found in ``EXCLUDED_WORDS`` and purely numeric words are
        ignored. Occurrences are grouped case-insensitively; the first
        spelling seen becomes the tag name.

        Returns:
            A list of ``Tag`` objects, highest score first.
        """
        content = read_file(self.file)
        self.feed(content)
        # Flush anything the parser is still buffering (e.g. trailing text).
        self.close()

        words = []
        for text in self.texte:
            words += WORD_SEPARATORS.split(unquote_plus(text))
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # Lowercased because candidates are lowercased before the comparison;
        # otherwise a directory name with capitals could never match.
        title_words = {
            part.lower() for part in self.file.parent.name.split('-')
        }

        words_with_usage = {}
        for raw_word in words:
            tag_name = raw_word.strip(STRIP_CHARS)
            if not tag_name:
                continue
            word = tag_name.lower()
            if word in EXCLUDED_WORDS or word.isdigit():
                continue

            score = _score_word(tag_name, word, title_words)
            known = words_with_usage.get(word)
            if known is None:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                known.score += score

        return sorted(
            words_with_usage.values(),
            key=lambda tag: tag.score,
            reverse=True,
        )

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value of the first ``href`` on an ``<a>`` tag."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A valueless attribute (<a href>) is reported as None, which
                # would later break unquote_plus(); skip it.
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Record every text node."""
        self.texte.append(data)


def display_tags(tags, min_score):
    """Print every tag whose score is strictly greater than ``min_score``."""
    for tag in tags:
        if tag.score <= min_score:
            continue
        print(f"Score: {tag.score:>3} Word: {tag.name}")


class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serialises a ``Tag`` as its name only."""

    def default(self, obj):
        """Return ``obj.name`` for a ``Tag``; defer to the base class otherwise."""
        if isinstance(obj, Tag):
            return obj.name
        return super().default(obj)


def write_tags(tags) -> None:
    """Serialise ``tags`` as JSON into ``OUTPUT_FILE``."""
    content = json.dumps(tags, indent=JSON_INDENT, cls=CustomJsonEncoder)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as handle:
        handle.write(content)


def read_file(file: Path) -> str:
    """Return the text content of ``file``, decoded as UTF-8."""
    # Use a separate name for the handle so the ``file`` argument is not
    # rebound, and pin the encoding instead of relying on the locale default.
    with open(file, 'r', encoding='utf-8') as handle:
        return handle.read()


def main() -> None:
    """Scan every ``index.txt`` below ``SOURCE_DIR`` and print its tags.

    Directories whose name starts with ``autosave-`` are skipped.
    """
    final_tags = {}
    for file in SOURCE_DIR.glob('**/index.txt'):
        title = file.parent.name
        if title.startswith('autosave-'):
            continue

        scanner = FileScanner(file)
        tags = scanner.scan_file()
        display_tags(tags, min_score=20)
        # Slicing already copes with lists shorter than the limit.
        final_tags[title] = tags[:TAGS_PER_ARTICLE]

    # write_tags(final_tags)


if __name__ == '__main__':
    main()