write output json file
This commit is contained in:
parent
76a38c84dc
commit
ea1763f1f7
2 changed files with 33 additions and 11 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -2,3 +2,4 @@ venv
|
|||
.idea
|
||||
data.zip
|
||||
*.pyc
|
||||
tags.json
|
||||
|
|
43
tagger.py
43
tagger.py
|
@ -1,3 +1,4 @@
|
|||
import json
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from html.parser import HTMLParser
|
||||
|
@ -5,6 +6,11 @@ from pathlib import Path
|
|||
|
||||
from exclude import EXCLUDED_WORDS
|
||||
|
||||
# Directory tree that is searched for article files.
SOURCE_DIR = Path('data')
# Name of the JSON file the collected tags are written to.
OUTPUT_FILE = 'tags.json'
# Maximum number of tags kept for each article.
TAGS_PER_ARTICLE = 5
# Indentation width used when serializing the JSON output.
JSON_INDENT = 2

# Matches a single uppercase ASCII letter.
upper_check = re.compile(r'[A-Z]')
|
||||
|
||||
|
||||
|
@ -48,8 +54,7 @@ class FileScanner(HTMLParser):
|
|||
words_with_usage[word] = Tag(name=tag_name, score=score)
|
||||
else:
|
||||
words_with_usage[word].score += score
|
||||
sorted_list = sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
||||
display_result(sorted_list)
|
||||
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
if tag != "a":
|
||||
|
@ -63,26 +68,42 @@ class FileScanner(HTMLParser):
|
|||
self.texte.append(data)
|
||||
|
||||
|
||||
def display_result(result):
|
||||
for tag in result:
|
||||
if tag.score <= 10:
|
||||
def display_tags(tags, min_score=10):
    """Print one line per tag, skipping tags at or below ``min_score``.

    Args:
        tags: Iterable of objects exposing ``score`` and ``name`` attributes.
        min_score: Threshold; a tag whose score is less than or equal to
            this value is not printed.
    """
    # Lazily filter so tags are still examined and printed in input order.
    kept = (tag for tag in tags if not (tag.score <= min_score))
    for entry in kept:
        print(f"Score: {entry.score:>3} Word: {entry.name}")
|
||||
|
||||
|
||||
class CustomJsonEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``Tag`` objects as their ``name``."""

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        ``Tag`` instances are replaced by their ``name`` attribute; every
        other object is delegated to ``json.JSONEncoder.default``.
        """
        if not isinstance(obj, Tag):
            return super().default(obj)
        return obj.name
|
||||
|
||||
|
||||
def write_tags(tags):
    """Serialize ``tags`` to JSON and write the result to ``OUTPUT_FILE``.

    Serialization uses ``CustomJsonEncoder`` and an indentation of
    ``JSON_INDENT``. The output file is opened in write mode, so any
    previous content is replaced.
    """
    # Serialization happens before the file is opened for writing.
    serialized = json.dumps(tags, cls=CustomJsonEncoder, indent=JSON_INDENT)
    with open(OUTPUT_FILE, mode='w') as output:
        output.write(serialized)
|
||||
|
||||
|
||||
def read_file(file: Path) -> str:
    """Return the entire text content of ``file``.

    The file is opened in text mode; no explicit encoding is passed to
    ``open``.
    """
    with open(file, mode='r') as handle:
        content = handle.read()
    return content
|
||||
|
||||
|
||||
def main(source=Path('data')):
|
||||
for index, file in enumerate(source.glob('**/index.txt')):
|
||||
if file.parent.name.startswith('autosave-'):
|
||||
def main():
|
||||
final_tags = {}
|
||||
for file in SOURCE_DIR.glob('**/index.txt'):
|
||||
title = file.parent.name
|
||||
if title.startswith('autosave-'):
|
||||
continue
|
||||
scanner = FileScanner(file)
|
||||
scanner.scan_file()
|
||||
# if index == 3:
|
||||
# break
|
||||
tags = scanner.scan_file()
|
||||
display_tags(tags)
|
||||
final_tags[title] = tags[:TAGS_PER_ARTICLE] if len(tags) > TAGS_PER_ARTICLE else tags
|
||||
# write_tags(final_tags)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
Loading…
Reference in a new issue