Coverage for meta_tags_parser/parse.py: 100%
82 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-09-18 21:46 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-09-18 21:46 +0000
1import contextvars
2import typing
4from selectolax.lexbor import LexborHTMLParser, LexborNode
6from . import structs
9if typing.TYPE_CHECKING:
10 from collections.abc import KeysView
# Context-local storage for the active parsing settings; starts out with the
# package defaults and is overwritten via ``set_settings_for_meta_tags``.
_GLOBAL_OPTIONS_HOLDER: typing.Final[contextvars.ContextVar[structs.SettingsFromUser]] = contextvars.ContextVar(
    "options",
    default=structs.DEFAULT_SETTINGS_FROM_USER,
)
def set_settings_for_meta_tags(new_options: structs.SettingsFromUser) -> None:
    """Make ``new_options`` the active package settings for the current context.

    The value is stored in the module-level context variable, which is what
    ``parse_meta_tags_from_source`` reads when it is called without explicit
    ``options``.
    """
    _GLOBAL_OPTIONS_HOLDER.set(new_options)
def _slice_html_for_meta(html_source: str, active_options: structs.SettingsFromUser) -> str:
    """Return the leading part of ``html_source`` that is worth parsing for meta tags.

    Only the first ``active_options.max_scan_chars`` characters are inspected.
    Every entry of ``active_options.boundary_tags`` is looked up in a
    lower-cased copy of that prefix (so a configured tag only matches if it is
    itself lower-case), and the occurrence closest to the start wins.

    * If the winning tag is the first configured boundary tag, the cut is made
      right after it (the tag stays in the result).
    * For any other boundary tag the cut is made right before it.
    * When ``active_options.hard_limit_chars`` is not ``None`` the cut position
      is additionally capped by it.
    * When no boundary tag is found, the first
      ``active_options.fallback_limit_chars`` characters are returned.
    """
    scanning_prefix: str = html_source[: active_options.max_scan_chars]
    lowered_prefix: str = scanning_prefix.lower()
    if len(lowered_prefix) != len(scanning_prefix):
        # ``str.lower()`` is not length-preserving: e.g. U+0130 lowers to two
        # code points.  Offsets found in such a string would not line up with
        # ``html_source``, so fall back to lowering character by character and
        # keep any character whose lower-case form is not a single code point.
        lowered_prefix = "".join(
            one_char.lower() if len(one_char.lower()) == 1 else one_char for one_char in scanning_prefix
        )

    earliest_position: int | None = None
    matched_boundary: str = ""
    for one_boundary_tag in active_options.boundary_tags:
        boundary_position: int = lowered_prefix.find(one_boundary_tag)
        # Strict "<" keeps the first configured tag when two tags share a position.
        if boundary_position != -1 and (earliest_position is None or boundary_position < earliest_position):
            earliest_position = boundary_position
            matched_boundary = one_boundary_tag

    if earliest_position is None:
        return html_source[: active_options.fallback_limit_chars]

    cut_position: int = earliest_position
    if matched_boundary == active_options.boundary_tags[0]:
        # The first boundary tag is kept inside the returned slice.
        cut_position += len(matched_boundary)
    if active_options.hard_limit_chars is not None:
        cut_position = min(cut_position, active_options.hard_limit_chars)
    return html_source[:cut_position]
def _extract_social_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
    media_type: typing.Literal[structs.WhatToParse.OPEN_GRAPH, structs.WhatToParse.TWITTER],
) -> list[structs.OneMetaTag]:
    """Collect the social-media meta tags of one family from prepared ``<meta>`` attributes.

    The configuration for ``media_type`` is read from
    ``structs.SETTINGS_FOR_SOCIAL_MEDIA``: ``"prop"`` lists the attribute names
    that may carry the tag name and ``"prefix"`` is the prefix such a name must
    start with.

    For every attribute group the candidate attributes are tried in the
    configured order.  A tag is emitted when the normalized attribute value
    starts with the prefix, something is left after stripping that prefix, and
    the group has a non-empty ``content`` attribute.  The emitted name is the
    normalized value without the leading prefix; the emitted value is the
    original (non-normalized) ``content``.  At most one tag is emitted per
    attribute group.

    Returns:
        The extracted tags, in the order of ``all_tech_attrs``.
    """
    possible_settings_for_parsing: typing.Final[typing.Mapping[str, str | tuple[str, ...]]] = (
        structs.SETTINGS_FOR_SOCIAL_MEDIA[media_type]
    )
    raw_prefix: typing.Final = possible_settings_for_parsing["prefix"]
    prefix_text: typing.Final[str] = str(raw_prefix)
    output_buffer: typing.Final[list[structs.OneMetaTag]] = []
    for one_attr_group in all_tech_attrs:
        content_group = one_attr_group.get("content")
        for attr_name in possible_settings_for_parsing["prop"]:
            name_group = one_attr_group.get(attr_name)
            if name_group is None or not name_group.normalized.startswith(raw_prefix):
                continue
            # Strip only the leading prefix: ``str.replace`` would also eat later
            # occurrences (e.g. "og:blog:title" would turn into "bltitle").
            og_tag_name: str = name_group.normalized.removeprefix(prefix_text)
            # NOTE(review): the ``break`` is assumed to follow a successful append,
            # i.e. the next candidate attribute is still tried otherwise — confirm.
            if og_tag_name and content_group is not None and content_group.original:
                output_buffer.append(structs.OneMetaTag(name=og_tag_name, value=content_group.original))
                break
    return output_buffer
def _extract_basic_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
) -> list[structs.OneMetaTag]:
    """Collect the meta tags whose ``name`` is listed in ``structs.BASIC_META_TAGS``.

    A group contributes a tag when it has a ``name`` attribute whose normalized
    value equals one of the basic tag names and a non-empty ``content``
    attribute; the original ``content`` text becomes the tag value.  Scanning
    stops as soon as the number of collected tags equals the number of entries
    in ``structs.BASIC_META_TAGS``.
    """
    collected: list[structs.OneMetaTag] = []
    for attr_group in all_tech_attrs:
        # Checked before looking at the group, so no further group is inspected
        # once the quota is reached.
        if len(collected) == len(structs.BASIC_META_TAGS):
            break
        if "name" not in attr_group or "content" not in attr_group:
            continue
        content_text = attr_group["content"].original
        if not content_text:
            continue
        declared_name = attr_group["name"].normalized
        for known_tag in structs.BASIC_META_TAGS:
            if declared_name == known_tag:
                collected.append(structs.OneMetaTag(name=known_tag, value=content_text))
    return collected
def _extract_all_other_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
) -> list[structs.OneMetaTag]:
    """Collect named meta tags that are neither social-media nor basic tags.

    A group is ignored when any attribute listed under ``"prop"`` in one of the
    ``structs.SETTINGS_FOR_SOCIAL_MEDIA`` configurations is present and its
    normalized value starts with that configuration's ``"prefix"``.  Of the
    remaining groups, only those with a ``name`` attribute outside
    ``structs.BASIC_META_TAGS`` and a non-empty ``content`` attribute produce a
    tag (normalized name, original content).
    """
    collected: list[structs.OneMetaTag] = []
    for attr_group in all_tech_attrs:
        belongs_to_social_media = any(
            prop_attr in attr_group and attr_group[prop_attr].normalized.startswith(social_config["prefix"])
            for social_config in structs.SETTINGS_FOR_SOCIAL_MEDIA.values()
            for prop_attr in social_config["prop"]
        )
        if belongs_to_social_media or "name" not in attr_group:
            continue

        tag_name = attr_group["name"].normalized
        if tag_name in structs.BASIC_META_TAGS or "content" not in attr_group:
            continue

        tag_value = attr_group["content"].original
        if tag_value:
            collected.append(structs.OneMetaTag(name=tag_name, value=tag_value))
    return collected
def _prepare_normalized_meta_attrs(html_tree: LexborHTMLParser) -> list[dict[str, structs.ValuesGroup]]:
    """Turn every ``<meta>`` node of ``html_tree`` into a dict of prepared attributes.

    Attribute names are lower-cased and stripped.  Each value is stored as a
    ``structs.ValuesGroup`` holding the original text and its lower-cased,
    stripped form; a missing or empty value is treated as ``""``.
    """
    collected: list[dict[str, structs.ValuesGroup]] = []
    for meta_node in html_tree.css("meta"):
        # Replace falsy values (attributes without a value) by an empty string first.
        attr_pairs = ((attr_name, raw_value or "") for attr_name, raw_value in meta_node.attributes.items())
        collected.append(
            {
                attr_name.lower().strip(): structs.ValuesGroup(
                    original=attr_text,
                    normalized=attr_text.lower().strip(),
                )
                for attr_name, attr_text in attr_pairs
            }
        )
    return collected
def parse_meta_tags_from_source(
    source_code: str | bytes,
    *,
    options: structs.SettingsFromUser | None = None,
) -> structs.TagsGroup:
    """Parse the page title and meta tags out of an HTML document.

    Args:
        source_code: The HTML as text, or as bytes; bytes are decoded with the
            default codec of ``bytes.decode`` and undecodable bytes are dropped.
        options: Settings for this call.  When omitted (or falsy) the value
            held by the package-level context variable is used.

    Returns:
        A ``structs.TagsGroup``.  Sections that are not listed in the settings'
        ``what_to_parse`` come back empty (``""`` for the title, ``[]`` for the
        tag lists).
    """
    html_text: str = source_code.decode(errors="ignore") if isinstance(source_code, bytes) else source_code
    settings: structs.SettingsFromUser = options or _GLOBAL_OPTIONS_HOLDER.get()
    requested = settings.what_to_parse

    # With ``optimize_input`` only the leading slice of the document is parsed.
    html_to_parse: str = _slice_html_for_meta(html_text, settings) if settings.optimize_input else html_text
    html_tree: LexborHTMLParser = LexborHTMLParser(html_to_parse)

    page_title: str = ""
    if structs.WhatToParse.TITLE in requested:
        title_node: LexborNode | None = html_tree.css_first("title")
        if title_node:
            page_title = title_node.text().strip()

    # The <meta> attributes are only collected when some meta section is requested.
    meta_sections = (
        structs.WhatToParse.OPEN_GRAPH,
        structs.WhatToParse.TWITTER,
        structs.WhatToParse.BASIC,
        structs.WhatToParse.OTHER,
    )
    meta_attrs: list[dict[str, structs.ValuesGroup]] = []
    if any(one_section in requested for one_section in meta_sections):
        meta_attrs = _prepare_normalized_meta_attrs(html_tree)

    open_graph_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.OPEN_GRAPH in requested:
        open_graph_tags = _extract_social_tags_from_precursor(meta_attrs, structs.WhatToParse.OPEN_GRAPH)

    twitter_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.TWITTER in requested:
        twitter_tags = _extract_social_tags_from_precursor(meta_attrs, structs.WhatToParse.TWITTER)

    basic_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.BASIC in requested:
        basic_tags = _extract_basic_tags_from_precursor(meta_attrs)

    other_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.OTHER in requested:
        other_tags = _extract_all_other_tags_from_precursor(meta_attrs)

    return structs.TagsGroup(
        title=page_title,
        basic=basic_tags,
        open_graph=open_graph_tags,
        twitter=twitter_tags,
        other=other_tags,
    )