From ea1763f1f7ad8e769b47c65ef01d25654be610e8 Mon Sep 17 00:00:00 2001
From: OneNewDev <onenewdev@mailbox.org>
Date: Sat, 28 May 2022 14:49:10 +0200
Subject: [PATCH] write output json file

---
 .gitignore |  1 +
 tagger.py  | 43 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/.gitignore b/.gitignore
index dabf72f..40feed7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ venv
 .idea
 data.zip
 *.pyc
+tags.json
diff --git a/tagger.py b/tagger.py
index a12c8ad..d4332b2 100644
--- a/tagger.py
+++ b/tagger.py
@@ -1,3 +1,4 @@
+import json
 import re
 from dataclasses import dataclass
 from html.parser import HTMLParser
@@ -5,6 +6,11 @@ from pathlib import Path
 
 from exclude import EXCLUDED_WORDS
 
+SOURCE_DIR = Path('data')  # root directory that main() searches for **/index.txt files
+OUTPUT_FILE = 'tags.json'  # file that write_tags() writes to
+TAGS_PER_ARTICLE = 5  # maximum number of tags main() keeps per article
+JSON_INDENT = 2  # indent passed to json.dumps() in write_tags()
+
 upper_check = re.compile(r'[A-Z]')
 
 
@@ -48,8 +54,7 @@ class FileScanner(HTMLParser):
                 words_with_usage[word] = Tag(name=tag_name, score=score)
             else:
                 words_with_usage[word].score += score
-        sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
-        display_result(sorted_list)
+        return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
     
     def handle_starttag(self, tag, attrs):
         if tag != "a":
@@ -63,26 +68,42 @@ class FileScanner(HTMLParser):
         self.texte.append(data)
 
 
-def display_result(result):
-    for tag in result:
-        if tag.score <= 10:
+def display_tags(tags, min_score=10):  # print every tag whose score is above min_score
+    for tag in tags:
+        if tag.score <= min_score:  # exclusive threshold: a score equal to min_score is skipped
             continue
         print(f"Score: {tag.score:>3} Word: {tag.name}")
 
 
+class CustomJsonEncoder(json.JSONEncoder):
+    """JSON encoder that writes a Tag as its name; other types use the base behaviour."""
+
+    def default(self, obj):
+        return obj.name if isinstance(obj, Tag) else super().default(obj)
+
+
+def write_tags(tags):
+    """Write *tags* to OUTPUT_FILE as indented JSON, encoding Tag objects via CustomJsonEncoder."""
+    serialized = json.dumps(tags, indent=JSON_INDENT, cls=CustomJsonEncoder)  # encode before touching the file
+    Path(OUTPUT_FILE).write_text(serialized)
+
+
 def read_file(file: Path) -> str:
     with open(file, 'r') as file:
         return file.read()
 
 
-def main(source=Path('data')):
-    for index, file in enumerate(source.glob('**/index.txt')):
-        if file.parent.name.startswith('autosave-'):
+def main():  # scan each article under SOURCE_DIR, print its tags and save the top ones as JSON
+    final_tags = {}  # article directory name -> its leading tags
+    for file in SOURCE_DIR.glob('**/index.txt'):
+        title = file.parent.name
+        if title.startswith('autosave-'):  # directories named autosave-* are not processed
             continue
         scanner = FileScanner(file)
-        scanner.scan_file()
-        # if index == 3:
-        #     break
+        tags = scanner.scan_file()
+        display_tags(tags)
+        final_tags[title] = tags[:TAGS_PER_ARTICLE]  # a slice never overruns, so no length check is needed
+    write_tags(final_tags)
 
 
 if __name__ == '__main__':