unquote url
This commit is contained in:
parent
737634e740
commit
819d6d51b9
1 changed file with 2 additions and 0 deletions
|
@ -2,6 +2,7 @@ import json
|
|||
import re
|
||||
from dataclasses import dataclass
|
||||
from html.parser import HTMLParser
|
||||
from urllib.parse import unquote_plus
|
||||
from pathlib import Path
|
||||
|
||||
from exclude import EXCLUDED_WORDS
|
||||
|
@ -33,6 +34,7 @@ class FileScanner(HTMLParser):
|
|||
words_with_usage = {}
|
||||
words = []
|
||||
for text in self.texte:
|
||||
text = unquote_plus(text)
|
||||
words += re.split(r'[ /\-_#\n.?=]', text)
|
||||
print(f'\nFile {self.file.parent.name} contains {len(words)} words')
|
||||
title_words = set(self.file.parent.name.split('-'))
|
||||
|
|
Loading…
Reference in a new issue