-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcolumn_analyzer.py
More file actions
117 lines (91 loc) · 4.36 KB
/
column_analyzer.py
File metadata and controls
117 lines (91 loc) · 4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pandas as pd
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import json
from typing import Dict, List, Any
class ColumnAnalyzer:
    """Analyzes CSV columns and generates descriptions using an LLM.

    For every column of a DataFrame a small summary (dtype, sample values,
    null/unique counts) is rendered into a prompt and sent to the chat model;
    the model's reply becomes the column's description.
    """

    # Prompt sent once per column; the placeholders are filled from the
    # summary dictionary built by ``_collect_column_info``.
    _PROMPT_TEMPLATE = (
        "\n"
        "Analyze the following column information and provide a clear, concise description of what this column represents.\n"
        "Column Information:\n"
        "- Name: {column_name}\n"
        "- Data Type: {dtype}\n"
        "- Sample Values: {sample_values}\n"
        "- Null Values: {null_count} out of {total_rows}\n"
        "- Unique Values: {unique_count}\n"
        "Please provide a brief description (1-2 sentences) of what this column represents based on the sample data and column name.\n"
        "Focus on the business meaning and data content, not technical details.\n"
    )
    _PROMPT_VARIABLES = (
        "column_name",
        "dtype",
        "sample_values",
        "null_count",
        "total_rows",
        "unique_count",
    )

    def __init__(self, llm: Any = None, *, model: str = "gpt-4o-mini", temperature: float = 0):
        """
        Create the analyzer.

        Args:
            llm: Optional pre-built chat model. It only needs an
                ``invoke(prompt)`` method whose result has a ``content``
                string attribute. When omitted, a ``ChatOpenAI`` is created.
            model: Model name used when ``llm`` is not supplied.
            temperature: Sampling temperature used when ``llm`` is not supplied.
        """
        if llm is None:
            llm = ChatOpenAI(model=model, temperature=temperature)
        self.llm = llm

    def analyze_columns(self, df: pd.DataFrame, sample_size: int = 10) -> Dict[str, str]:
        """
        Analyze each column and generate descriptions using LLM

        Args:
            df: The pandas DataFrame to analyze
            sample_size: Number of sample values to send to LLM for each column

        Returns:
            Dictionary mapping column names to their descriptions. A column
            whose analysis raises gets the placeholder
            "Column <name> (analysis failed)" instead of aborting the run.
        """
        column_descriptions: Dict[str, str] = {}

        for column in df.columns:
            # Best-effort per column: one bad column must not lose the rest.
            try:
                column_info = self._collect_column_info(df, column, sample_size)
                column_descriptions[column] = self._generate_column_description(column_info)
            except Exception as e:
                print(f"Error analyzing column {column}: {e}")
                column_descriptions[column] = f"Column {column} (analysis failed)"

        return column_descriptions

    @staticmethod
    def _collect_column_info(df: pd.DataFrame, column: Any, sample_size: int) -> Dict[str, Any]:
        """Summarize one column (dtype, samples, null/unique counts) for the prompt."""
        series = df[column]
        # Sample from the non-null values of the whole column so that
        # `sample_size` is honored even when the leading rows are empty.
        samples = series.dropna().head(max(sample_size, 0)).tolist()
        return {
            'name': column,
            'dtype': str(series.dtype),
            'sample_values': samples,
            # Plain ints (not numpy scalars) keep the summary easy to print/serialize.
            'null_count': int(series.isnull().sum()),
            'unique_count': int(series.nunique()),
            'total_rows': len(df),
        }

    def _generate_column_description(self, column_info: Dict[str, Any]) -> str:
        """
        Generate a description for a single column using LLM

        Args:
            column_info: Summary dictionary with the keys 'name', 'dtype',
                'sample_values', 'null_count', 'total_rows' and 'unique_count'.

        Returns:
            The model's reply with surrounding whitespace stripped, or a
            generic "Column <name> with <dtype> data type" fallback when the
            model call fails.
        """
        prompt = PromptTemplate(
            input_variables=list(self._PROMPT_VARIABLES),
            template=self._PROMPT_TEMPLATE,
        )

        try:
            response = self.llm.invoke(prompt.format(
                column_name=column_info['name'],
                dtype=column_info['dtype'],
                sample_values=column_info['sample_values'],
                null_count=column_info['null_count'],
                total_rows=column_info['total_rows'],
                unique_count=column_info['unique_count'],
            ))
            return response.content.strip()
        except Exception as e:
            print(f"Error generating description for {column_info['name']}: {e}")
            return f"Column {column_info['name']} with {column_info['dtype']} data type"

    def create_dataset_context(self, column_descriptions: Dict[str, str], df: pd.DataFrame) -> str:
        """
        Create a context string with column descriptions and basic dataset info

        Args:
            column_descriptions: Dictionary of column descriptions
            df: The original DataFrame

        Returns:
            Formatted context string for the agent
        """
        header = (
            "\n"
            "Dataset Overview:\n"
            f"- Total rows: {len(df)}\n"
            f"- Total columns: {len(df.columns)}\n"
            "Column Descriptions:\n"
        )
        # One "- name: description" line per column, in dictionary order.
        entries = "".join(
            f"\n- {column}: {description}"
            for column, description in column_descriptions.items()
        )
        footer = (
            "\n"
            "You have access to a DataFrame named 'df' with the above structure.\n"
            "You can use pandas operations to analyze this data, but remember you're working with the full dataset.\n"
        )
        return header + entries + footer