write output json file

This commit is contained in:
OneNewDev 2022-05-28 14:49:10 +02:00
parent 76a38c84dc
commit ea1763f1f7
2 changed files with 33 additions and 11 deletions

1
.gitignore vendored
View file

@ -2,3 +2,4 @@ venv
.idea
data.zip
*.pyc
tags.json

View file

@ -1,3 +1,4 @@
import json
import re
from dataclasses import dataclass
from html.parser import HTMLParser
@ -5,6 +6,11 @@ from pathlib import Path
from exclude import EXCLUDED_WORDS
# Root directory searched recursively for article `index.txt` files.
SOURCE_DIR = Path('data')
# Name of the JSON file the collected tags are written to.
OUTPUT_FILE = 'tags.json'
# Maximum number of highest-scoring tags kept for each article.
TAGS_PER_ARTICLE = 5
# Indentation width passed to json.dumps when serializing the tags.
JSON_INDENT = 2
# Matches a single uppercase ASCII letter (A-Z).
upper_check = re.compile(r'[A-Z]')
@ -48,8 +54,7 @@ class FileScanner(HTMLParser):
words_with_usage[word] = Tag(name=tag_name, score=score)
else:
words_with_usage[word].score += score
sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
display_result(sorted_list)
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
def handle_starttag(self, tag, attrs):
if tag != "a":
@ -63,26 +68,42 @@ class FileScanner(HTMLParser):
self.texte.append(data)
def display_result(result):
for tag in result:
if tag.score <= 10:
def display_tags(tags, min_score=10):
    """Print the score and name of each tag scoring above ``min_score``.

    Tags whose score is less than or equal to ``min_score`` are not shown.
    Each printed line has the score right-aligned in a 3-character column.
    """
    for tag in tags:
        if tag.score > min_score:
            print(f"Score: {tag.score:>3} Word: {tag.name}")
class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Tag`` objects as their ``name``."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        ``Tag`` instances are reduced to their ``name`` attribute; any other
        object is passed to the base class implementation.
        """
        if not isinstance(obj, Tag):
            return super().default(obj)
        return obj.name
def write_tags(tags):
    """Serialize ``tags`` to JSON and write the result to ``OUTPUT_FILE``.

    The output is indented by ``JSON_INDENT`` and encoded with
    ``CustomJsonEncoder``. Any existing file content is overwritten.
    """
    # Serialization happens before the file is opened for writing.
    serialized = json.dumps(tags, cls=CustomJsonEncoder, indent=JSON_INDENT)
    with open(OUTPUT_FILE, mode='w') as output:
        output.write(serialized)
def read_file(file: Path) -> str:
    """Return the entire text content of ``file``.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(file, mode='r') as handle:
        content = handle.read()
    return content
# NOTE(review): two versions of main() appear interleaved in this span (the
# diff's +/- markers and the indentation are not present in this view). The
# next three lines look like the previous signature and loop header,
# superseded by the parameterless `def main():` below - confirm against the
# repository before editing.
def main(source=Path('data')):
for index, file in enumerate(source.glob('**/index.txt')):
if file.parent.name.startswith('autosave-'):
# Scans every `index.txt` found recursively under SOURCE_DIR, skipping those
# whose parent directory name starts with 'autosave-', and collects the tags
# of each remaining file in `final_tags`, keyed by that directory name.
def main():
final_tags = {}
for file in SOURCE_DIR.glob('**/index.txt'):
title = file.parent.name
if title.startswith('autosave-'):
continue
scanner = FileScanner(file)
# NOTE(review): the bare scan_file() call and the two commented-out lines
# below presumably belong to the previous version - confirm.
scanner.scan_file()
# if index == 3:
# break
tags = scanner.scan_file()
display_tags(tags)
# Keeps at most TAGS_PER_ARTICLE entries; the slice alone would yield the same
# list contents, so the length check is redundant.
final_tags[title] = tags[:TAGS_PER_ARTICLE] if len(tags) > TAGS_PER_ARTICLE else tags
# NOTE(review): the write is disabled, so `final_tags` is built but not used
# anywhere in the visible code.
# write_tags(final_tags)
if __name__ == '__main__':