exclude code blocks
This commit is contained in:
parent
6a68caca90
commit
0c6c621919
1 changed file with 6 additions and 2 deletions
|
@@ -13,6 +13,7 @@ SOURCE_FILENAME = 'index.txt'
|
|||
OUTPUT_FILE = 'tags.json'
|
||||
TAGS_PER_ARTICLE = 5
|
||||
JSON_INDENT = 2
|
||||
EXCLUDED_HTML_TAGS = {'code'}
|
||||
|
||||
# Wegen Performance vordefinierte Variablen
|
||||
_UPPER_CHECK = re.compile(r'[A-Z]')
|
||||
|
@@ -29,6 +30,7 @@ class FileScanner(HTMLParser):
|
|||
super().__init__()
|
||||
self.file = file
|
||||
self.texte = []
|
||||
self._current_html_tag = None
|
||||
|
||||
def scan_file(self):
|
||||
# Datei einlesen
|
||||
|
@@ -86,6 +88,7 @@ class FileScanner(HTMLParser):
|
|||
return sorted(words_with_usage.values(), key=lambda tag: tag.score, reverse=True)
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
self._current_html_tag = tag
|
||||
# Die Links, die in den 'href' Attributen eines <a> HTML-Elements stehen, mit einbeziehen.
|
||||
if tag != "a":
|
||||
return
|
||||
|
@@ -95,8 +98,9 @@ class FileScanner(HTMLParser):
|
|||
break
|
||||
|
||||
def handle_data(self, data):
|
||||
# Den Text innerhalb eines HTML-Elements mit einbeziehen.
|
||||
self.texte.append(data)
|
||||
if self._current_html_tag not in EXCLUDED_HTML_TAGS:
|
||||
# Den Text innerhalb eines HTML-Elements mit einbeziehen.
|
||||
self.texte.append(data)
|
||||
|
||||
|
||||
def display_tags(tags, min_score):
|
||||
|
|
Loading…
Reference in a new issue