From ea1763f1f7ad8e769b47c65ef01d25654be610e8 Mon Sep 17 00:00:00 2001
From: OneNewDev
Date: Sat, 28 May 2022 14:49:10 +0200
Subject: [PATCH] write output json file

---
Review notes (free text after "---" is not part of the commit):
- Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. Hunk line counts and the diffstat totals match, but
  the indentation of "# write_tags(final_tags)" and the spacing inside the
  print f-string are best guesses; verify against blob d4332b2.
- main() collects final_tags but the write_tags() call is still commented
  out, so this commit does not yet produce tags.json.
- "tags[:TAGS_PER_ARTICLE] if len(tags) > TAGS_PER_ARTICLE else tags" has
  the same contents as plain "tags[:TAGS_PER_ARTICLE]"; slicing already
  clamps to the list length.
- read_file() rebinds its "file" parameter to the opened handle, and both
  open() calls rely on the platform default text encoding.

 .gitignore |  1 +
 tagger.py  | 43 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index dabf72f..40feed7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ venv
 .idea
 data.zip
 *.pyc
+tags.json
diff --git a/tagger.py b/tagger.py
index a12c8ad..d4332b2 100644
--- a/tagger.py
+++ b/tagger.py
@@ -1,3 +1,4 @@
+import json
 import re
 from dataclasses import dataclass
 from html.parser import HTMLParser
@@ -5,6 +6,11 @@ from pathlib import Path
 
 from exclude import EXCLUDED_WORDS
 
+SOURCE_DIR = Path('data')
+OUTPUT_FILE = 'tags.json'
+TAGS_PER_ARTICLE = 5
+JSON_INDENT = 2
+
 upper_check = re.compile(r'[A-Z]')
 
 
@@ -48,8 +54,7 @@ class FileScanner(HTMLParser):
                 words_with_usage[word] = Tag(name=tag_name, score=score)
             else:
                 words_with_usage[word].score += score
-        sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
-        display_result(sorted_list)
+        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
 
     def handle_starttag(self, tag, attrs):
         if tag != "a":
@@ -63,26 +68,42 @@ class FileScanner(HTMLParser):
         self.texte.append(data)
 
 
-def display_result(result):
-    for tag in result:
-        if tag.score <= 10:
+def display_tags(tags, min_score=10):
+    for tag in tags:
+        if tag.score <= min_score:
             continue
         print(f"Score: {tag.score:>3} Word: {tag.name}")
 
 
+class CustomJsonEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, Tag):
+            return obj.name
+        return super().default(obj)
+
+
+def write_tags(tags):
+    content = json.dumps(tags, indent=JSON_INDENT, cls=CustomJsonEncoder)
+    with open(OUTPUT_FILE, 'w') as file:
+        file.write(content)
+
+
 def read_file(file: Path) -> str:
     with open(file, 'r') as file:
         return file.read()
 
 
-def main(source=Path('data')):
-    for index, file in enumerate(source.glob('**/index.txt')):
-        if file.parent.name.startswith('autosave-'):
+def main():
+    final_tags = {}
+    for file in SOURCE_DIR.glob('**/index.txt'):
+        title = file.parent.name
+        if title.startswith('autosave-'):
             continue
         scanner = FileScanner(file)
-        scanner.scan_file()
-        # if index == 3:
-        #     break
+        tags = scanner.scan_file()
+        display_tags(tags)
+        final_tags[title] = tags[:TAGS_PER_ARTICLE] if len(tags) > TAGS_PER_ARTICLE else tags
+    # write_tags(final_tags)
 
 
 if __name__ == '__main__':