diff --git a/tide/utils.py b/tide/utils.py index ee8c114..a56b339 100644 --- a/tide/utils.py +++ b/tide/utils.py @@ -6,7 +6,6 @@ from bigtree import dict_to_tree, levelordergroup_iter from bigtree.node import node from typing import TypeVar -import re T = TypeVar("T", bound=node.Node) @@ -145,136 +144,102 @@ def get_data_col_names_from_root(data_root): ][-1] -def find_cols_with_tide_tags( - data_columns: pd.Index | list[str], request: str -) -> list[str]: - request_parts = request.split("__") - - if not (1 <= len(request_parts) <= 4): - raise ValueError( - f"Request '{request}' is malformed. " - f"Use 'name__unit__bloc__sub_bloc' format or a " - f"combination of these tags." - ) - - full_tag_col_map = { - col_name_tag_enrichment(col, get_tags_max_level(data_columns)): col - for col in data_columns - } - - def find_exact_match(search_str, target): - pattern = rf"(?:^|__)(?:{re.escape(search_str)})(?:$|__)" - match = re.search(pattern, target) - return match is not None - - return [ - full_tag_col_map[augmented_col] - for augmented_col in full_tag_col_map.keys() - if all(find_exact_match(part, augmented_col) for part in request_parts) - ] - - -def find_cols_multiple_tag_groups( - data_columns: pd.Index | list[str], request: str -) -> list[str]: - request_parts = request.split("|") - list_to_return = [] - for req in request_parts: - list_to_return.extend(find_cols_with_tide_tags(data_columns, req)) - return list_to_return - - def tide_request( - data_columns: pd.Index | list[str], request: str | pd.Index | list[str] = None + data_columns: pd.Index | list[str], + request: str | list[str] | pd.Index | None = None, ) -> list[str]: """ Select columns by matching structured TIDE-style tags. - Filters column names based on a TIDE-style structured tag syntax. Columns are - expected to use a naming convention with double underscores (`__`) separating - tags. + Column names follow the format: + + name__unit__bloc__sub_bloc + + Tags are separated by double underscores ("__"). Not all levels are required. + + Before matching, column names are automatically enriched to the maximum + tag depth present in `data_columns`. Missing tag levels are filled using + DEFAULT_TAGS, ensuring consistent hierarchical comparison. - A column name can include up to four hierarchical parts: - 'name__unit__bloc__sub_bloc' where each part is optional, but must be separated - with double underscores. + The `request` argument defines tag queries: - The `request` argument allows searching for columns matching one or more - of these parts using full or partial tag patterns. Multiple tag patterns - can be combined using the pipe `|` character to form OR conditions. + - Tags are separated by "__" + - OR conditions are separated by "|" + - Multiple request entries are OR-combined + - Matching is exact per tag part (no substring matching) Parameters ---------- data_columns : pandas.Index or list of str - A collection of column names to filter. Each column name should follow - the TIDE format (e.g., "sensor__°C__bloc1"). + Collection of column names using TIDE-style tagging. - request : str or list of str or pandas.Index, optional - Tag(s) to match against the column names. Each tag string may be: + request : str or list[str] or pandas.Index, optional + Tag query expression(s). Each expression may contain: - - A full structured tag (e.g., "name__°C__bloc2") + - A full tag path (e.g., "name__°C__bloc2") - A partial tag (e.g., "°C", "bloc1") - - A group of tags separated by "|" (e.g., "kWh|°C") + - OR groups separated by "|" (e.g., "kWh|°C") - If None, all columns from `data_columns` are returned. + If None, all columns are returned. Returns ------- - list of str - The list of column names that match any of the provided tag queries. + list[str] + Column names matching at least one request expression. + Order is preserved and duplicates are removed. Notes ----- - - Matching is done per tag part, not substrings. For instance, the query - "bloc1" will match "name__°C__bloc1" but not "bloc11". - - If multiple requests are given, columns are returned if they match - at least one of them (logical OR). - - Tags can include between 1 and 4 parts, split by `__`. + - Matching is performed on enriched tag representations. + - Default tag values (e.g., "OTHER") may be injected during enrichment. + - Matching is exact at tag level, not substring-based. + - Requests may contain between 1 and 4 tag levels. Examples -------- - >>> DF_COLUMNS = [ - ... "name_1__°C__bloc1", - ... "name_1__°C__bloc2", - ... "name_2", - ... "name_2__DIMENSIONLESS__bloc2", - ... "name_3__kWh/m²", - ... "name_5__kWh", - ... "name4__DIMENSIONLESS__bloc4", - ... ] - - >>> tide_request(DF_COLUMNS) - ['name_1__°C__bloc1', 'name_1__°C__bloc2', 'name_2', - 'name_2__DIMENSIONLESS__bloc2', 'name_3__kWh/m²', - 'name_5__kWh', 'name4__DIMENSIONLESS__bloc4'] - >>> tide_request(DF_COLUMNS, "°C") - ['name_1__°C__bloc1', 'name_1__°C__bloc2'] - >>> tide_request(DF_COLUMNS, "kWh|°C") - ['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2'] - - >>> # Columns are not selected twice >>> tide_request(DF_COLUMNS, ["kWh|°C", "name_5__kWh"]) - ['name_5__kWh', 'name_1__°C__bloc1', 'name_1__°C__bloc2'] """ if request is None: return list(data_columns) - elif isinstance(request, str): + if isinstance(request, str): request = [request] - if not (isinstance(request, pd.Index) or isinstance(request, list)): + if not isinstance(request, (list, pd.Index)): raise ValueError( - "Invalid request. Was expected an instance of str, pd.Index or List[str]" - f"got {type(request)} instead" + f"request must be str, list[str], pd.Index or None, got {type(request)}" ) - list_to_return = [] + max_level = get_tags_max_level(data_columns) + + # Enrich columns once + enriched_map = { + col_name_tag_enrichment(col, max_level): col for col in data_columns + } + + selected = [] + for req in request: - list_to_return.extend(find_cols_multiple_tag_groups(data_columns, req)) + for group in req.split("|"): + group_tags = group.split("__") + + if not (1 <= len(group_tags) <= 4): + raise ValueError( + f"Request '{group}' is malformed. " + "Use up to 4 tags separated by '__'." + ) + + for enriched_name, original in enriched_map.items(): + tags = enriched_name.split("__") + + # Exact per-tag match + if all(tag in tags for tag in group_tags): + selected.append(original) - return list(dict.fromkeys(list_to_return)) + return list(dict.fromkeys(selected)) def data_columns_to_tree(columns: pd.Index | list[str]) -> T: