Coverage for meta_tags_parser/parse.py: 100%

82 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-09-18 21:46 +0000

1import contextvars 

2import typing 

3 

4from selectolax.lexbor import LexborHTMLParser, LexborNode 

5 

6from . import structs 

7 

8 

9if typing.TYPE_CHECKING: 

10 from collections.abc import KeysView 

11 

12 

# Holder for the package-wide parsing settings. It is a context variable, so a value stored
# with ``.set()`` is visible in the current context only; until something is stored, ``.get()``
# yields ``structs.DEFAULT_SETTINGS_FROM_USER``.
_GLOBAL_OPTIONS_HOLDER: typing.Final[contextvars.ContextVar[structs.SettingsFromUser]] = contextvars.ContextVar(
    "options",
    default=structs.DEFAULT_SETTINGS_FROM_USER,
)

16 

17 

def set_settings_for_meta_tags(new_options: structs.SettingsFromUser) -> None:
    """Replace the package-wide parsing settings.

    The settings are stored in the ``_GLOBAL_OPTIONS_HOLDER`` context variable, so the
    override is visible in the current context only.

    Args:
        new_options: settings object that becomes the new default.
    """
    _GLOBAL_OPTIONS_HOLDER.set(new_options)

21 

22 

def _lowercase_preserving_offsets(text: str) -> str:
    """Lowercase *text* so that the result always has exactly the same length as the input."""
    lowered_text: str = text.lower()
    if len(lowered_text) == len(text):
        # Lowercasing never shortens a string, so equal lengths mean a 1:1 character mapping.
        return lowered_text
    # Some characters (for example "İ", U+0130) lowercase to more than one code point, which
    # would shift every later offset. Leave such characters untouched to keep offsets aligned.
    return "".join(one_char.lower() if len(one_char.lower()) == 1 else one_char for one_char in text)


def _slice_html_for_meta(html_source: str, active_options: structs.SettingsFromUser) -> str:
    """Return the leading part of *html_source*, cut at the earliest boundary tag.

    Only the first ``max_scan_chars`` characters are searched, case-insensitively, for the
    entries of ``boundary_tags``. The earliest match decides where to cut: when it is the
    first entry of ``boundary_tags`` the tag itself is kept in the result, for any other
    entry the cut happens right before the tag. ``hard_limit_chars`` (when not ``None``)
    caps the result length. When no boundary is found, the first ``fallback_limit_chars``
    characters are returned.

    Args:
        html_source: full HTML text.
        active_options: settings providing ``max_scan_chars``, ``boundary_tags``,
            ``hard_limit_chars`` and ``fallback_limit_chars``.

    Returns:
        A prefix of ``html_source``.
    """
    scanning_prefix: str = html_source[: active_options.max_scan_chars]
    # Offsets found in the lowered text are applied to the original text below,
    # so both strings must have identical lengths.
    lowered_prefix: str = _lowercase_preserving_offsets(scanning_prefix)

    earliest_position: int | None = None
    matched_boundary: str = ""
    for one_boundary_tag in active_options.boundary_tags:
        boundary_position: int = lowered_prefix.find(one_boundary_tag)
        if boundary_position == -1:
            continue
        if earliest_position is None or boundary_position < earliest_position:
            earliest_position = boundary_position
            matched_boundary = one_boundary_tag

    if earliest_position is None:
        return html_source[: active_options.fallback_limit_chars]

    first_boundary_tag: str = active_options.boundary_tags[0]
    cut_position: int = earliest_position
    if matched_boundary == first_boundary_tag:
        # The first boundary tag is part of the wanted section, so cut after it.
        cut_position += len(first_boundary_tag)
    if active_options.hard_limit_chars is not None:
        cut_position = min(cut_position, active_options.hard_limit_chars)
    return html_source[:cut_position]

46 

47 

def _extract_social_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
    media_type: typing.Literal[structs.WhatToParse.OPEN_GRAPH, structs.WhatToParse.TWITTER],
) -> list[structs.OneMetaTag]:
    """Collect the meta tags that belong to one social-media vocabulary.

    The configuration for ``media_type`` is read from ``structs.SETTINGS_FOR_SOCIAL_MEDIA``:
    ``"prop"`` lists the attribute names that may carry the tag name and ``"prefix"`` is the
    prefix such a name must start with. For every attribute group, the first candidate
    attribute that yields a non-empty name together with a non-empty ``content`` produces
    one tag.

    Args:
        all_tech_attrs: one mapping per ``<meta>`` element (attribute name -> values group).
        media_type: which social vocabulary to extract.

    Returns:
        Tags named by the part after the prefix, with the original ``content`` as value.
    """
    possible_settings_for_parsing: typing.Final[typing.Mapping[str, str | tuple[str, ...]]] = (
        structs.SETTINGS_FOR_SOCIAL_MEDIA[media_type]
    )
    # Both lookups are loop-invariant, so resolve them once.
    raw_prefix: typing.Final[str | tuple[str, ...]] = possible_settings_for_parsing["prefix"]
    prefix_text: typing.Final[str] = str(raw_prefix)
    candidate_attr_names: typing.Final[str | tuple[str, ...]] = possible_settings_for_parsing["prop"]

    output_buffer: typing.Final[list[structs.OneMetaTag]] = []
    for one_attr_group in all_tech_attrs:
        content_group: structs.ValuesGroup | None = one_attr_group.get("content")
        for attr_name in candidate_attr_names:
            name_group: structs.ValuesGroup | None = one_attr_group.get(attr_name)
            if name_group is None or not name_group.normalized.startswith(raw_prefix):
                continue
            # Strip only the leading prefix: ``str.replace`` would also delete later
            # occurrences and mangle names such as "og:blog:title" into "bltitle".
            og_tag_name: str = name_group.normalized.removeprefix(prefix_text)
            if og_tag_name and content_group is not None and content_group.original:
                output_buffer.append(structs.OneMetaTag(name=og_tag_name, value=content_group.original))
                break
    return output_buffer

70 

71 

def _extract_basic_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
) -> list[structs.OneMetaTag]:
    """Collect the meta tags whose ``name`` is listed in ``structs.BASIC_META_TAGS``.

    Scanning stops as soon as the number of collected tags equals the number of entries in
    ``structs.BASIC_META_TAGS``. Groups without a ``name`` or without a non-empty ``content``
    are ignored.

    Args:
        all_tech_attrs: one mapping per ``<meta>`` element (attribute name -> values group).

    Returns:
        Tags named by the matching basic tag name, with the original ``content`` as value.
    """
    found_tags: typing.Final[list[structs.OneMetaTag]] = []
    wanted_total: typing.Final[int] = len(structs.BASIC_META_TAGS)
    for attrs in all_tech_attrs:
        if len(found_tags) == wanted_total:
            break

        name_group = attrs.get("name")
        content_group = attrs.get("content")
        if name_group is None or content_group is None or not content_group.original:
            continue

        for basic_name in structs.BASIC_META_TAGS:
            if name_group.normalized == basic_name:
                found_tags.append(structs.OneMetaTag(name=basic_name, value=content_group.original))
    return found_tags

95 

96 

def _extract_all_other_tags_from_precursor(
    all_tech_attrs: list[dict[str, structs.ValuesGroup]],
) -> list[structs.OneMetaTag]:
    """Collect named meta tags that are neither social-media tags nor basic tags.

    A group is skipped when any attribute listed under ``"prop"`` in
    ``structs.SETTINGS_FOR_SOCIAL_MEDIA`` starts with the matching ``"prefix"``, when it has
    no ``name`` attribute, or when its normalized name is in ``structs.BASIC_META_TAGS``.
    The remaining groups produce a tag if they have a non-empty ``content``.

    Args:
        all_tech_attrs: one mapping per ``<meta>`` element (attribute name -> values group).

    Returns:
        Tags named by the normalized ``name``, with the original ``content`` as value.
    """

    def _is_social(attrs: dict[str, structs.ValuesGroup]) -> bool:
        # True when any configured social attribute carries its configured prefix.
        return any(
            prop_name in attrs and attrs[prop_name].normalized.startswith(social_config["prefix"])
            for social_config in structs.SETTINGS_FOR_SOCIAL_MEDIA.values()
            for prop_name in social_config["prop"]
        )

    collected: typing.Final[list[structs.OneMetaTag]] = []
    for attrs in all_tech_attrs:
        if _is_social(attrs):
            continue

        name_group = attrs.get("name")
        if name_group is None or name_group.normalized in structs.BASIC_META_TAGS:
            continue

        content_group = attrs.get("content")
        if content_group is not None and content_group.original:
            collected.append(structs.OneMetaTag(name=name_group.normalized, value=content_group.original))
    return collected

124 

125 

def _prepare_normalized_meta_attrs(html_tree: LexborHTMLParser) -> list[dict[str, structs.ValuesGroup]]:
    """Turn every ``<meta>`` element of *html_tree* into a mapping of normalized attributes.

    Attribute names are lowercased and stripped. Each value is stored as a
    ``structs.ValuesGroup`` holding the original text (a missing value becomes ``""``) and a
    lowercased, stripped copy.

    Args:
        html_tree: parsed HTML document.

    Returns:
        One mapping per ``<meta>`` element, in the order ``html_tree.css("meta")`` yields them.
    """

    def _as_values_group(raw_value: str | None) -> structs.ValuesGroup:
        # Attributes without a value come back falsy; treat them as empty text.
        text: str = raw_value or ""
        return structs.ValuesGroup(original=text, normalized=text.lower().strip())

    return [
        {raw_name.lower().strip(): _as_values_group(raw_value) for raw_name, raw_value in meta_node.attributes.items()}
        for meta_node in html_tree.css("meta")
    ]

138 

139 

def parse_meta_tags_from_source(
    source_code: str | bytes,
    *,
    options: structs.SettingsFromUser | None = None,
) -> structs.TagsGroup:
    """Parse the page title and the meta tags out of an HTML document.

    Args:
        source_code: HTML as text or as bytes; bytes are decoded with the default codec and
            undecodable bytes are dropped.
        options: per-call settings; when omitted (or falsy) the value held by
            ``_GLOBAL_OPTIONS_HOLDER`` is used.

    Returns:
        A ``structs.TagsGroup`` with the title and the basic, Open Graph, Twitter and other
        tags. Anything not requested in ``what_to_parse`` is returned empty.
    """
    html_text: str
    if isinstance(source_code, bytes):
        html_text = source_code.decode(errors="ignore")
    else:
        html_text = source_code

    active_options: structs.SettingsFromUser = options or _GLOBAL_OPTIONS_HOLDER.get()
    requested = active_options.what_to_parse

    if active_options.optimize_input:
        # Parse only the leading part of the document that is cut at a boundary tag.
        html_text = _slice_html_for_meta(html_text, active_options)
    html_tree: typing.Final[LexborHTMLParser] = LexborHTMLParser(html_text)

    page_title: str = ""
    if structs.WhatToParse.TITLE in requested:
        title_node: LexborNode | None = html_tree.css_first("title")
        if title_node:
            page_title = title_node.text().strip()

    # The attribute list is only built when at least one meta-tag group was requested.
    meta_kinds = (
        structs.WhatToParse.OPEN_GRAPH,
        structs.WhatToParse.TWITTER,
        structs.WhatToParse.BASIC,
        structs.WhatToParse.OTHER,
    )
    normalized_meta_attrs: list[dict[str, structs.ValuesGroup]] = []
    if any(one_kind in requested for one_kind in meta_kinds):
        normalized_meta_attrs = _prepare_normalized_meta_attrs(html_tree)

    open_graph_meta_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.OPEN_GRAPH in requested:
        open_graph_meta_tags = _extract_social_tags_from_precursor(
            normalized_meta_attrs, structs.WhatToParse.OPEN_GRAPH
        )

    twitter_meta_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.TWITTER in requested:
        twitter_meta_tags = _extract_social_tags_from_precursor(normalized_meta_attrs, structs.WhatToParse.TWITTER)

    basic_meta_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.BASIC in requested:
        basic_meta_tags = _extract_basic_tags_from_precursor(normalized_meta_attrs)

    other_meta_tags: list[structs.OneMetaTag] = []
    if structs.WhatToParse.OTHER in requested:
        other_meta_tags = _extract_all_other_tags_from_precursor(normalized_meta_attrs)

    return structs.TagsGroup(
        title=page_title,
        basic=basic_meta_tags,
        open_graph=open_graph_meta_tags,
        twitter=twitter_meta_tags,
        other=other_meta_tags,
    )

197 )