Tagger/tagger.py

116 lines
3.1 KiB
Python
Raw Normal View History

2022-05-28 12:49:10 +00:00
import json
2022-05-05 16:01:10 +00:00
import re
2022-05-05 17:30:56 +00:00
from dataclasses import dataclass
from html.parser import HTMLParser
2022-05-28 13:34:41 +00:00
from urllib.parse import unquote_plus
2022-05-05 17:30:56 +00:00
from pathlib import Path
from exclude import EXCLUDED_WORDS
2022-05-28 12:49:10 +00:00
# Directory that main() searches recursively for 'index.txt' article files.
SOURCE_DIR = Path('data')
# File that write_tags() writes the serialised result to.
OUTPUT_FILE = 'tags.json'
# Upper bound on the number of tags main() keeps for each article.
TAGS_PER_ARTICLE = 5
# Indentation width handed to json.dumps() in write_tags().
JSON_INDENT = 2
2022-05-28 13:17:33 +00:00
# Matches one ASCII uppercase letter; FileScanner.scan_file() counts the
# matches in a word and adds a score bonus per match.
UPPER_CHECK = re.compile(r'[A-Z]')
2022-05-05 16:01:10 +00:00
2022-05-05 17:30:56 +00:00
@dataclass
class Tag:
    """A tag candidate: a word together with its accumulated score."""

    # Spelling of the word as it should be displayed / exported.
    name: str
    # Relevance score; higher means a better tag candidate.
    score: int
class FileScanner(HTMLParser):
    """Collect the text of one HTML article and rank its words as tags.

    While parsing, every text node and the ``href`` value of every ``<a>``
    element is stored in ``self.texte``.  :meth:`scan_file` then splits these
    fragments into words and scores each distinct word.
    """

    def __init__(self, file: Path):
        """Create a scanner for *file*; nothing is read until scan_file()."""
        super().__init__()
        self.file = file
        # Raw fragments (text nodes and link targets) in document order.
        self.texte = []

    def scan_file(self):
        """Parse the file and return its tag candidates, best score first.

        Words are matched case-insensitively; the ``Tag.name`` keeps the
        spelling of the first occurrence.  Empty fragments, words listed in
        ``EXCLUDED_WORDS`` and pure digit strings are ignored.  The scores of
        all occurrences of a word are summed.

        Returns:
            A list of ``Tag`` objects sorted by descending score.
        """
        self.feed(read_file(self.file))
        # Flush text the parser may still be buffering (e.g. trailing data
        # ending in an unfinished character reference).
        self.close()

        words = []
        for text in self.texte:
            # Link targets may be percent-encoded; decode before splitting.
            words += re.split(r'[ /\-_#\n.?=]', unquote_plus(text))
        print(f'\nFile {self.file.parent.name} contains {len(words)} words')

        # The directory name acts as the article title.  Lower-case its parts
        # because they are compared against lower-cased words below.
        title_words = {
            part.lower() for part in self.file.parent.name.split('-')
        }

        words_with_usage = {}
        for raw_word in words:
            tag_name = raw_word.strip(".,:;!\"'<>()")
            if not tag_name:
                continue
            word = tag_name.lower()
            if word in EXCLUDED_WORDS or word.isdigit():
                continue
            score = self._score_word(tag_name, word, title_words)
            known = words_with_usage.get(word)
            if known is None:
                words_with_usage[word] = Tag(name=tag_name, score=score)
            else:
                known.score += score
        return sorted(
            words_with_usage.values(),
            key=lambda tag: tag.score,
            reverse=True,
        )

    @staticmethod
    def _score_word(tag_name, word, title_words):
        """Return the score of a single occurrence of a word.

        Base score is 10, multiplied by 4 when the word is part of the title.
        Words of at most three characters are scaled down by ``length / 4``
        (truncated to int).  Every ASCII uppercase letter in *tag_name* adds 5.
        """
        score = 10
        if word in title_words:
            score *= 4
        word_length = len(word)
        if word_length <= 3:
            score = int(score * word_length / 4)
        score += len(UPPER_CHECK.findall(tag_name)) * 5
        return score

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` of an ``<a>`` element; ignore all other tags."""
        if tag != "a":
            return
        for attr_name, attr_value in attrs:
            if attr_name == "href":
                # HTMLParser reports a valueless attribute (``<a href>``) as
                # None; storing it would break unquote_plus() in scan_file().
                if attr_value is not None:
                    self.texte.append(attr_value)
                break

    def handle_data(self, data):
        """Record a text node."""
        self.texte.append(data)
2022-05-05 16:01:10 +00:00
2022-05-28 12:52:25 +00:00
def display_tags(tags, min_score):
    """Print each tag whose score is strictly greater than *min_score*.

    Tags are printed in the order given, one per line.
    """
    for entry in tags:
        if entry.score > min_score:
            print(f"Score: {entry.score:>3} Word: {entry.name}")
2022-05-05 16:01:10 +00:00
2022-05-28 12:49:10 +00:00
class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serialises a ``Tag`` as its ``name`` string."""

    def default(self, obj):
        """Return ``obj.name`` for a Tag; defer to the base class otherwise."""
        if not isinstance(obj, Tag):
            return super().default(obj)
        return obj.name
def write_tags(tags):
    """Serialise *tags* to JSON and write the result to ``OUTPUT_FILE``.

    ``Tag`` objects are encoded through ``CustomJsonEncoder``.  Serialisation
    happens before the file is opened, so a failure leaves the file untouched.
    """
    serialized = json.dumps(tags, indent=JSON_INDENT, cls=CustomJsonEncoder)
    Path(OUTPUT_FILE).write_text(serialized)
2022-05-05 16:01:10 +00:00
def read_file(file: Path) -> str:
    """Return the complete text content of *file*."""
    with open(file) as handle:
        return handle.read()
2022-05-28 12:49:10 +00:00
def main():
    """Scan every article below ``SOURCE_DIR`` and print its tag candidates.

    An article is an ``index.txt`` file; the name of its parent directory is
    used as the title.  Directories whose name starts with ``autosave-`` are
    skipped.  The best ``TAGS_PER_ARTICLE`` tags per title are collected.
    """
    final_tags = {}
    for file in SOURCE_DIR.glob('**/index.txt'):
        title = file.parent.name
        if not title.startswith('autosave-'):
            tags = FileScanner(file).scan_file()
            display_tags(tags, min_score=20)
            # Slicing already copes with lists shorter than the limit.
            final_tags[title] = tags[:TAGS_PER_ARTICLE]
    # NOTE(review): persisting the result is currently disabled, so
    # final_tags is built but never written.
    # write_tags(final_tags)
2022-05-05 16:01:10 +00:00
# Run the tagger only when executed as a script, not when imported.
if __name__ == "__main__":
    main()