Tagger/tagger.py

import json
import re
from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path

from exclude import EXCLUDED_WORDS

# Directory that main() searches recursively for files named ``index.txt``.
SOURCE_DIR = Path('data')
# Name of the file that write_tags() writes the JSON result to.
OUTPUT_FILE = 'tags.json'
# Maximum number of tags main() keeps for each article.
TAGS_PER_ARTICLE = 5
# Indentation width handed to json.dumps() by write_tags().
JSON_INDENT = 2

# Matches one ASCII uppercase letter; used to count capitals inside a word.
upper_check = re.compile(r'[A-Z]')


@dataclass
class Tag:
    """A candidate tag for an article together with its accumulated score."""

    # Spelling of the word as it was first seen in the text (case preserved).
    name: str
    # Sum of the per-occurrence scores of the word.
    score: int


class FileScanner(HTMLParser):
    """Collect the text of one article file and rank its words as tags.

    While parsing, every text node and the ``href`` value of every ``<a>``
    element is gathered in ``self.texte``. ``scan_file`` then splits the
    gathered text into words and scores them.
    """

    def __init__(self, file: Path):
        """Remember *file*; nothing is read until ``scan_file`` is called."""
        super().__init__()
        self.file = file
        # Raw text fragments gathered by the parser callbacks.
        self.texte = []

    def scan_file(self):
        """Parse the file and return its candidate tags, best score first.

        Each occurrence of a word contributes a score computed as follows:

        * start at 10, or at 0 when the lower-cased word is in
          ``EXCLUDED_WORDS``;
        * multiply by 4 when the word is part of the title (the name of the
          file's parent directory, split on ``-``);
        * halve (integer division) when the word has three characters or
          fewer;
        * add 5 for every ASCII uppercase letter in the original spelling.

        Occurrences are grouped case-insensitively; the resulting ``Tag``
        keeps the spelling of the first occurrence.

        Returns:
            A list of ``Tag`` objects sorted by descending score. Tags with
            equal scores keep the order in which they were first seen.
        """
        content = read_file(self.file)
        self.feed(content)
        # feed() may hold back trailing text (e.g. after a dangling "&" or
        # "<"); close() forces the parser to deliver it to handle_data().
        self.close()

        words = []
        for text in self.texte:
            words.extend(re.split(r'[ /\-_#\n]', text))
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # Words are compared in lower case below, so the title words have to
        # be lower-cased too or capitalised directory names never match.
        title_words = set(self.file.parent.name.lower().split('-'))

        words_with_usage = {}
        for word in words:
            # Keep the original spelling for display and for the capital
            # letter bonus; use the lower-cased form as the grouping key.
            tag_name = word.strip(".,:;!\"'<>()")
            word = tag_name.lower()
            if not word:
                continue
            score = 10
            if word in EXCLUDED_WORDS:
                score = 0
            if word in title_words:
                score *= 4
            if len(word) <= 3:
                score //= 2
            upper_letters_count = len(upper_check.findall(tag_name))
            score += upper_letters_count * 5
            if word not in words_with_usage:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                words_with_usage[word].score += score
        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value of ``<a>`` elements as additional text."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # A valueless attribute (``<a href>``) is reported as None;
                # storing it would make re.split() fail in scan_file().
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Record a text node."""
        self.texte.append(data)


def display_tags(tags, min_score=10):
    """Print every tag whose score is strictly greater than *min_score*.

    Args:
        tags: Iterable of objects exposing ``score`` and ``name``.
        min_score: Threshold; tags scoring at or below it are not printed.
    """
    above_threshold = (entry for entry in tags if entry.score > min_score)
    for entry in above_threshold:
        print(f"Score: {entry.score:>3} Word: {entry.name}")


class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serialises a ``Tag`` as its bare name string."""

    def default(self, obj):
        """Return ``obj.name`` for a ``Tag``; defer to the base class otherwise."""
        if not isinstance(obj, Tag):
            # The base implementation raises TypeError for unsupported types.
            return super().default(obj)
        return obj.name


def write_tags(tags):
    """Serialise *tags* to JSON and write the result to ``OUTPUT_FILE``.

    The whole document is rendered in memory first, so the output file is
    only opened (and truncated) once serialisation has succeeded.
    """
    serialized = json.dumps(tags, cls=CustomJsonEncoder, indent=JSON_INDENT)
    with open(OUTPUT_FILE, 'w') as out:
        out.write(serialized)


def read_file(file: Path) -> str:
    """Return the complete text content of *file*.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(file, 'r') as handle:
        content = handle.read()
    return content


def main():
    """Scan every article below ``SOURCE_DIR`` and print its scored tags.

    An article is any ``index.txt`` file; its title is the name of the
    directory containing it. Directories whose name starts with
    ``autosave-`` are skipped. The best ``TAGS_PER_ARTICLE`` tags of each
    article are collected per title.
    """
    final_tags = {}
    for article in SOURCE_DIR.glob('**/index.txt'):
        title = article.parent.name
        if title.startswith('autosave-'):
            continue
        ranked = FileScanner(article).scan_file()
        display_tags(ranked)
        # Slicing already copes with lists shorter than the limit.
        final_tags[title] = ranked[:TAGS_PER_ARTICLE]
    # NOTE: writing the collected tags to disk is currently disabled.
    # write_tags(final_tags)


# Run the tagger only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()