io.py

# -*- coding: utf-8 -*-
"""io.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1-7WtK2kJBYiaC2LklaEkQIw7_Fgz1y1s
"""

# -*- coding: utf-8 -*-
"""MoE Framework for Anxiety Explainability - Grok & Claude with DDQN, SHAP (Refactored)

Mixture of Experts Framework for Enhanced Explainability of Anxiety States
Pre- and Post-Intervention Across Experimental Groups (Refactored)

This notebook implements a mixture of experts (MoE) framework to enhance the
explainability of anxiety states before and after interventions across
different groups. It combines large language models (LLMs) with causal
inference and a (simplified) DDQN agent for potential action recommendations.

Workflow:
1. Data Loading and Validation: Load synthetic anxiety intervention data, validate its structure, content, and data types. Handle potential errors gracefully.
2. Data Preprocessing: One-hot encode categorical features and scale numerical features.
3. Causal Structure Discovery (Regression-Based):  Use regression to infer potential causal relationships.
4. SHAP Value Calculation: Compute SHAP values to assess feature importance.
5. Data Visualization: Generate KDE, Violin, Parallel Coordinates, Hypergraph, and Correlation Heatmap plots.
6. Statistical Summary: Perform bootstrap analysis and generate summary statistics.
7. DDQN Agent (Simplified):  A simplified DDQN agent is included as a *placeholder* for potential action recommendations based on the analysis.  This is NOT a fully trained or integrated RL component.
8. LLM Insights Report: Synthesize findings using Grok, Claude, and Grok-Enhanced, emphasizing explainability and the combination of different analysis techniques.

Keywords: Mixture of Experts, Anxiety States, Pre-Intervention Anxiety, Post-Intervention Anxiety, Causal Inference, LLMs, Explainability, Group Dynamics, DDQN, SHAP, Data Visualization, Regression-Based Causal Discovery
"""

# Suppress warnings (use with caution in production code)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module="plotly")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import shap
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from io import StringIO
from itertools import combinations  # For generating combinations of features
import plotly.express as px
from scipy.stats import bootstrap
from matplotlib.colors import LinearSegmentedColormap

# Google Colab specific - Mounting Google Drive
try:
    from google.colab import drive
    drive.mount("/content/drive")
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False
    print("Not running in Google Colab environment.")

# Constants
OUTPUT_PATH = "./output_anxiety_moe/" if not COLAB_ENV else "/content/drive/MyDrive/output_anxiety_moe/"
PARTICIPANT_ID_COLUMN = "participant_id"
GROUP_COLUMN = "group"  # Original group column name
ANXIETY_PRE_COLUMN = "anxiety_pre"
ANXIETY_POST_COLUMN = "anxiety_post"
MODEL_GROK_NAME = "grok-base"
MODEL_CLAUDE_NAME = "claude-3.7-sonnet"
MODEL_GROK_ENHANCED_NAME = "grok-enhanced"
LINE_WIDTH = 2.5
BOOTSTRAP_RESAMPLES = 500

# IMPORTANT SECURITY WARNING (Placeholder keys)
GROK_API_KEY = "YOUR_API"  # Placeholder
CLAUDE_API_KEY = "YOUR_API" # Placeholder

def create_output_directory(path):
    """Creates the output directory if it doesn't exist, handling errors."""
    try:
        os.makedirs(path, exist_ok=True)
        return True
    except OSError as e:
        print(f"Error creating output directory: {e}")
        return False

def load_data_from_synthetic_string(csv_string):
    """Loads data from a CSV string, handling errors."""
    try:
        csv_file = StringIO(csv_string)
        return pd.read_csv(csv_file)
    except pd.errors.ParserError as e:
        print(f"Error parsing CSV data: {e}")
        return None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

def validate_dataframe(df, required_columns):
    """Validates the DataFrame: checks for missing columns, non-numeric data,
    duplicate participant IDs, valid group labels, and plausible anxiety ranges.
    Returns True if valid, False otherwise.
    """
    if df is None:
        print("Error: DataFrame is None. Cannot validate.")
        return False

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"Error: Missing columns: {missing_columns}")
        return False

    for col in required_columns:
        if col != PARTICIPANT_ID_COLUMN and col != GROUP_COLUMN:
            if not pd.api.types.is_numeric_dtype(df[col]):
                print(f"Error: Non-numeric values found in column: {col}")
                return False

    if df[PARTICIPANT_ID_COLUMN].duplicated().any():
        print("Error: Duplicate participant IDs found.")
        return False

    valid_groups = ["Group 1", "Group 2"]  # Define valid group names
    invalid_groups = df[~df[GROUP_COLUMN].isin(valid_groups)][GROUP_COLUMN].unique()
    if invalid_groups.size > 0:
        print(f"Error: Invalid group labels found: {invalid_groups}")
        return False

    for col in [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN]:
        if df[col].min() < 0 or df[col].max() > 10: # Assuming anxiety is on 0-10 scale
            print(f"Error: Anxiety scores in column '{col}' are out of range (0-10).")
            return False

    return True

def analyze_text_with_llm(text, model_name):
    """Simulates text analysis by different language models."""
    text_lower = text.lower()  # Normalize text to lowercase

    if model_name == MODEL_GROK_NAME:
        # Simulation for Grok-base
        if "causal graph" in text_lower:
            return "Grok-base Analysis: The causal graph suggests potential influences between pre-anxiety, post-anxiety, and group. Further investigation is needed to confirm these relationships."
        elif "shap summary" in text_lower:
            return "Grok-base Analysis: SHAP values indicate the relative importance of group and pre-anxiety in predicting post-anxiety. Pre-anxiety is a strong predictor."
        elif "kde plot" in text_lower or "violin plot" in text_lower:
            return "Grok-base Analysis: KDE and Violin plots show distributions of pre-anxiety and post-anxiety, suggesting variations between groups."
        elif "hypergraph" in text_lower:
            return "Grok-base Analysis: The hypergraph highlights clusters of influential participants based on pre-anxiety and post-anxiety."
        elif "summary statistics" in text_lower:
            return "Grok-base Analysis: Summary statistics provide a robust view of participants' average anxiety."
        elif "parallel coordinates" in text_lower:
            return "Grok-base Analysis: The parallel coordinates plot effectively shows how anxiety levels change from pre to post intervention across different groups, highlighting group-specific patterns and trends."
        elif "heatmap" in text_lower:
            return "Grok-base Analysis: The correlation heatmap reveals potential causal factors by highlighting the strength of relationships between variables."
        else:
            return (
                f"General Grok-base analysis on: '{text}'. Suggests deeper investigation into metrics."
            )

    elif model_name == MODEL_CLAUDE_NAME:
        # Simulation for Claude 3.7 Sonnet
        if "causal graph" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The causal graph shows potential dependencies and possible confounding factors between participant anxiety metrics."
        elif "shap summary" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: SHAP values reveal how each variable contributes to post-anxiety, highlighting variability among participants."
        elif "kde plot" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The KDE plot compares distributions of pre-anxiety and post-anxiety, suggesting visible correlations."
        elif "violin plot" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The Violin plot details the shapes of anxiety distributions across groups."
        elif "hypergraph" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The hypergraph identifies emerging communities of participants with high pre and post anxiety."
        elif "summary statistics" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: Statistics provide a quantitative basis for understanding the overall impact of the intervention on anxiety levels."
        elif "parallel coordinates" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The parallel coordinates visualization clearly shows the individual trajectories from pre to post anxiety states, revealing group-specific responses to the intervention."
        elif "heatmap" in text_lower:
            return "Claude 3.7 Sonnet Interpretation: The correlation heatmap provides quantitative evidence for relationships between variables, supporting causal inference through statistical associations."
        else:
            return (
                f"General Claude 3.7 Sonnet analysis on: '{text}'. Suggests focus on actionable data patterns."
            )

    elif model_name == MODEL_GROK_ENHANCED_NAME:
        # Simulation for Grok-Enhanced
        if "causal graph" in text_lower:
            return "Grok-Enhanced Analysis: The causal graph reveals subtle connections between pre-anxiety and post-anxiety, with group as a key moderator."
        elif "shap summary" in text_lower:
            return "Grok-Enhanced Analysis: SHAP values show that pre-anxiety dominates post-anxiety prediction, but group adds important nuances."
        elif "kde plot" in text_lower:
            return "Grok-Enhanced Analysis: The KDE plot highlights anxiety peaks in certain groups, suggesting significant trends."
        elif "violin plot" in text_lower:
            return "Grok-Enhanced Analysis: The Violin plot displays variations in anxiety distribution, with long tails indicating influential outliers."
        elif "hypergraph" in text_lower:
            return "Grok-Enhanced Analysis: The hypergraph connects participants in influence networks, revealing emerging collaboration patterns."
        elif "summary statistics" in text_lower:
            return "Grok-Enhanced Analysis: Summary statistics point to a stable average anxiety, but with high variability among participants."
        elif "parallel coordinates" in text_lower:
            return "Grok-Enhanced Analysis: The parallel coordinates plot reveals detailed individual response patterns to the intervention, with clear group-specific trajectories and notable outliers worthy of further investigation."
        elif "heatmap" in text_lower:
            return "Grok-Enhanced Analysis: The correlation heatmap uncovers complex interdependencies in the data that suggest potential root causes and mediating factors in anxiety reduction."
        else:
            return (
                f"General Grok-Enhanced analysis on: '{text}'. Recommends exploring interdisciplinary connections in the data."
            )
    else:
        return f"Model '{model_name}' not supported in this simulation."

class DDQNAgent:
    """
    A simplified DDQN agent for demonstration purposes.  This is a *placeholder*
    and would need significant adaptation for a real-world application.
    """
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        # Initialize Q-network and target network with random values (for demonstration)
        self.q_network = np.random.rand(state_dim, action_dim)
        self.target_network = np.copy(self.q_network)

    def act(self, state, epsilon=0.01):
        """Epsilon-greedy action selection."""
        if np.random.rand() < epsilon:
            return np.random.choice(self.action_dim)  # Explore
        else:
            return np.argmax(self.q_network[state])  # Exploit

    def learn(self, batch, gamma=0.99, learning_rate=0.1):
        """Placeholder learning function.  A real implementation would update the Q-network."""
        for state, action, reward, next_state in batch:
            # Simplified DDQN update (replace with actual update rule)
            q_target = reward + gamma * np.max(self.target_network[next_state])
            q_predict = self.q_network[state, action]
            self.q_network[state, action] += learning_rate * (q_target - q_predict)

    def update_target_network(self):
        """Placeholder target network update."""
        self.target_network = np.copy(self.q_network)

def scale_data(df, columns):
    """Scales specified columns using MinMaxScaler, handling errors."""
    try:
        scaler = MinMaxScaler()
        df[columns] = scaler.fit_transform(df[columns])
        return df
    except ValueError as e:
        print(f"Error during data scaling: {e}")
        return None  # Or raise the exception
    except Exception as e:
        print(f"An unexpected error occurred during scaling: {e}")
        return None

def discover_causal_structure_with_regression(df, variables, output_path):
    """
    Alternative approach to discover causal structure using regression-based methods.
    This function doesn't rely on the causal-learn library.
    """
    try:
        # Create a graph to represent causal relationships
        G = nx.DiGraph()
        G.add_nodes_from(variables)

        # Dictionary to store relationship strengths
        edge_strengths = {}

        # Use regression to test possible causal relationships
        # Testing if each pair of variables might have a causal relationship
        for target in variables:
            if target == ANXIETY_POST_COLUMN:  # Focus on predicting post-anxiety
                # Potential predictors are all variables except the target
                predictors = [var for var in variables if var != target]

                # Fit a linear regression model
                X = df[predictors]
                y = df[target]
                model = LinearRegression()
                model.fit(X, y)

                # Evaluate model performance
                y_pred = model.predict(X)
                mse = mean_squared_error(y, y_pred)

                # Get coefficients
                coeffs = dict(zip(predictors, model.coef_))

                # Add edges for significant predictors (arbitrary threshold)
                for predictor, coef in coeffs.items():
                    if abs(coef) > 0.1:  # Arbitrary threshold for significance
                        G.add_edge(predictor, target, weight=abs(coef))
                        edge_strengths[(predictor, target)] = abs(coef)

        # Draw and save the causal graph
        plt.figure(figsize=(10, 8))
        plt.style.use('dark_background') # Set dark background
        pos = nx.spring_layout(G)

        # Draw nodes
        nx.draw_networkx_nodes(G, pos, node_color="#00FFFF", node_size=700, alpha=0.8)

        # Draw edges with varying thickness based on weight
        edge_weights = [G[u][v]['weight'] * 2 for u, v in G.edges()]
        nx.draw_networkx_edges(G, pos, width=edge_weights, edge_color="#FF00FF", alpha=0.7)

        # Draw labels
        nx.draw_networkx_labels(G, pos, font_color="white", font_size=12)

        # Add edge labels (weights)
        edge_labels = {(u, v): f"{G[u][v]['weight']:.2f}" for u, v in G.edges()}
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color="yellow")

        plt.title("Regression-Based Causal Graph", color="white")
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "causal_graph.png"))
        plt.close()

        # Generate text description of edges
        edges_info = [
            f"  Node {u} -> Node {v}: Strength {G[u][v]['weight']:.3f}"
            for u, v in G.edges()
        ]

        edge_info_str = "\n".join(edges_info)
        print("Edges from Causal Graph:\n" + edge_info_str)
        return edge_info_str

    except Exception as e:
        print(f"Error during regression-based causal structure discovery: {e}")
        return None

def calculate_shap_values(df, feature_columns, target_column, output_path):
    """Calculates and visualizes SHAP values, handling one-hot encoded columns."""
    try:
        # Identify one-hot encoded group columns
        encoded_group_cols = [col for col in df.columns if col.startswith(f"{GROUP_COLUMN}_")]

        # Combine encoded group columns with other feature columns (excluding the original GROUP_COLUMN)
        feature_cols_encoded = encoded_group_cols + [
            col for col in feature_columns if col != GROUP_COLUMN and col not in encoded_group_cols
        ]

        # Train a RandomForestRegressor model
        model_rf = RandomForestRegressor(random_state=42).fit(df[feature_cols_encoded], df[target_column])

        # Create a TreeExplainer and calculate SHAP values
        explainer = shap.TreeExplainer(model_rf)
        shap_values = explainer.shap_values(df[feature_cols_encoded])

        # Create and save the SHAP summary plot
        plt.figure(figsize=(10, 8))
        plt.style.use('dark_background')
        shap.summary_plot(shap_values, df[feature_cols_encoded], show=False, color_bar=True)
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "shap_summary.png"))
        plt.close()

        return f"SHAP summary for features {feature_cols_encoded} predicting {target_column}"

    except Exception as e:
        print(f"Error during SHAP value calculation: {e}")
        return "Error: SHAP value calculation failed."

def create_kde_plot(df, column1, column2, output_path, colors):
    """Creates a KDE plot, handling potential errors."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')
        sns.kdeplot(data=df[column1], color=colors[0], label=column1.capitalize(), linewidth=LINE_WIDTH)
        sns.kdeplot(data=df[column2], color=colors[1], label=column2.capitalize(), linewidth=LINE_WIDTH)
        plt.title('KDE Plot of Anxiety Levels', color='white')
        plt.legend(facecolor='black', edgecolor='white', labelcolor='white')
        plt.savefig(os.path.join(output_path, 'kde_plot.png'))
        plt.close()
        return f"KDE plot visualizing distributions of {column1} and {column2}"
    except KeyError as e:
        print(f"Error generating KDE plot: Column not found: {e}")
        return "Error: KDE plot generation failed.  Missing column."
    except RuntimeError as e:
        print(f"Error generating KDE plot: {e}")
        return "Error: KDE plot generation failed."
    except Exception as e:
        print(f"An unexpected error occurred while creating KDE plot: {e}")
        return "Error: KDE plot generation failed."

def create_violin_plot(df, group_column, y_column, output_path, colors):
    """Creates a violin plot, handling potential errors and one-hot encoded groups."""
    try:
        plt.figure(figsize=(10, 6))
        plt.style.use('dark_background')

        # Handling group column when already one-hot encoded
        encoded_group_cols = [col for col in df.columns if col.startswith(f"{group_column}_")]

        if len(encoded_group_cols) > 0:
            # Create a temporary column for group membership
            df['temp_group'] = np.nan
            for col in encoded_group_cols:
                group_name = col.split('_', 1)[1]  # Extract group name from encoded column
                df.loc[df[col] == 1, 'temp_group'] = group_name

            # Create violin plot
            sns.violinplot(data=df, x='temp_group', y=y_column, palette=colors[:len(encoded_group_cols)], linewidth=LINE_WIDTH)
            # Remove the temp group after plotting
            df.drop('temp_group', axis=1, inplace=True)
        else:
            # If group column is already categorical
            sns.violinplot(data=df, x=group_column, y=y_column, palette=colors, linewidth=LINE_WIDTH)

        plt.title('Violin Plot of Anxiety Distribution by Group', color='white')
        plt.savefig(os.path.join(output_path, 'violin_plot.png'))
        plt.close()
        return f"Violin plot showing {y_column} across groups"

    except KeyError as e:
        print(f"Error generating violin plot: Column not found: {e}")
        return "Error: Violin plot generation failed. Missing column."
    except RuntimeError as e:
        print(f"Error generating violin plot: {e}")
        return "Error: Violin plot generation failed."
    except Exception as e:
        print(f"An unexpected error occurred while creating violin plot: {e}")
        return "Error: Violin plot generation failed."

def create_parallel_coordinates_plot(df, group_column, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Creates a parallel coordinates plot and returns a text description."""
    try:
        # Prepare data: Need original group names, not one-hot encoded.
        plot_df = df[[group_column, anxiety_pre_column, anxiety_post_column]].copy()

        # Create a color map for groups
        unique_groups = plot_df[group_column].unique()
        group_color_map = {group: colors[i % len(colors)] for i, group in enumerate(unique_groups)}

        # Map group names to colors
        plot_df['color'] = plot_df[group_column].map(group_color_map)

        # Create the parallel coordinates plot
        fig = px.parallel_coordinates(
            plot_df,
            color='color',  # Use the new 'color' column
            dimensions=[anxiety_pre_column, anxiety_post_column],
            title="Anxiety Levels: Pre- vs Post-Intervention by Group",
            color_continuous_scale=px.colors.sequential.Viridis, # Using Viridis
        )

        # Customize appearance
        fig.update_layout(
            plot_bgcolor='black',
            paper_bgcolor='black',
            font_color='white',
            title_font_size=16,
        )

        # Instead of saving the image, create a text description
        description = (
            f"Parallel Coordinates Plot Description:\n"
            f"This plot visualizes the change in anxiety levels from pre-intervention ({anxiety_pre_column}) "
            f"to post-intervention ({anxiety_post_column}) for each participant.\n"
            f"Each line represents a participant, and the color of the line indicates their group membership.\n"
            f"The x-axis shows the anxiety levels, and the y-axis represents the two time points (pre and post).\n"
            f"The plot allows for easy comparison of individual trajectories and group-level trends.\n\n"
            f"Group Color Mapping:\n"
        )
        for group, color in group_color_map.items():
            description += f"- {group}: {color}\n"

        # Save the description to a text file
        with open(os.path.join(output_path, "parallel_coordinates_description.txt"), "w") as f:
            f.write(description)

        return description

    except KeyError as e:
        print(f"Error generating parallel coordinates plot: Column not found: {e}")
        return "Error: Parallel coordinates plot generation failed. Missing column."
    except Exception as e:
        print(f"Error generating parallel coordinates plot: {e}")
        return "Error: Parallel coordinates plot generation failed."

def create_correlation_heatmap(df, output_path, colors):
    """Creates a correlation heatmap, handling errors."""
    try:
        plt.figure(figsize=(10, 8))
        plt.style.use('dark_background')

        # Calculate correlation matrix, only including numeric columns
        numeric_df = df.select_dtypes(include=[np.number])
        corr_matrix = numeric_df.corr()

        # Create mask for upper triangle
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

        # Custom colormap with neon colors
        custom_colors = ['#FF00FF', '#FFFFFF', '#00FFFF']  # Neon purple, white, neon cyan
        colormap = LinearSegmentedColormap.from_list('custom', custom_colors, N=256)

        # Create heatmap
        sns.heatmap(
            corr_matrix,
            mask=mask,
            cmap=colormap,
            vmin=-1,
            vmax=1,
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.8},
            annot=True,
            fmt=".2f",
            annot_kws={"size": 10}
        )

        plt.title("Correlation Heatmap for Root Cause Analysis", fontsize=16, color="white")
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "correlation_heatmap.png"))
        plt.close()

        return "Correlation heatmap showing relationships between variables for causal inference."

    except Exception as e:
        print(f"Error generating correlation heatmap: {e}")
        return "Error: Correlation heatmap generation failed."

def visualize_hypergraph(df, anxiety_pre_column, anxiety_post_column, output_path, colors):
    """Creates a hypergraph, handling errors."""
    try:
        G = nx.Graph()

        participant_ids = df[PARTICIPANT_ID_COLUMN].tolist()
        G.add_nodes_from(participant_ids, bipartite=0)

        feature_sets = {
            "anxiety_pre_high": df[PARTICIPANT_ID_COLUMN][
                df[anxiety_pre_column] > df[anxiety_pre_column].mean()
            ].tolist(),
            "anxiety_post_high": df[PARTICIPANT_ID_COLUMN][
                df[anxiety_post_column] > df[anxiety_post_column].mean()
            ].tolist(),
        }

        feature_nodes = list(feature_sets.keys())
        G.add_nodes_from(feature_nodes, bipartite=1)

        for feature, participants in feature_sets.items():
            for participant in participants:
                G.add_edge(participant, feature)

        plt.figure(figsize=(12, 10))
        plt.style.use('dark_background')

        pos = nx.bipartite_layout(G, participant_ids)
        color_map = [colors[0] if node in participant_ids else colors[1] for node in G]

        nx.draw(
            G,
            pos,
            with_labels=True,
            node_color=color_map,
            font_color="white",
            edge_color="gray",
            width=LINE_WIDTH,
            node_size=700,
            font_size=10,
            alpha=0.9
        )
        plt.title("Hypergraph Representation of Anxiety Patterns", fontsize=16, color="white")
        plt.tight_layout()
        plt.savefig(os.path.join(output_path, "hypergraph.png"))
        plt.close()
        return "Hypergraph visualizing participant relationships based on anxiety pre and post intervention"

    except Exception as e:
        print(f"Error creating hypergraph: {e}")
        return "Error in hypergraph visualization"

def perform_bootstrap(data, statistic, n_resamples=BOOTSTRAP_RESAMPLES):
    """Performs bootstrap analysis, handling errors."""
    try:
        bootstrap_result = bootstrap((data,), statistic, n_resamples=n_resamples, method="percentile", random_state=42) # Added random_state
        return bootstrap_result.confidence_interval
    except Exception as e:
        print(f"Error during bootstrap analysis: {e}")
        return (None, None)

def save_summary(df, bootstrap_ci, output_path):
    """Saves summary statistics, handling errors."""
    try:
        summary_text = df.describe().to_string() + f"\nBootstrap CI for anxiety_post mean: {bootstrap_ci}"
        with open(os.path.join(output_path, "summary.txt"), "w") as f:
            f.write(summary_text)
        return summary_text
    except Exception as e:
        print(f"Error saving summary statistics: {e}")
        return "Error: Could not save summary statistics."

def generate_insights_report(
    summary_stats_text,
    causal_edges_info,
    shap_analysis_info,
    kde_plot_desc,
    violin_plot_desc,
    parallel_coords_desc,
    hypergraph_desc,
    correlation_heatmap_desc,
    output_path,
):
    """Generates an insights report using analyses from Grok, Claude, and Grok-Enhanced, handling errors."""
    try:
        grok_insights = (
            analyze_text_with_llm(
                f"Analyze summary statistics:\n{summary_stats_text}", MODEL_GROK_NAME
            )
            + "\n\n"
            + (
                analyze_text_with_llm(
                    f"Interpret causal graph edges:\n{causal_edges_info}", MODEL_GROK_NAME
                )
                + "\n\n"
                if causal_edges_info
                else ""
            )
        )

        claude_insights = (
            analyze_text_with_llm(f"Interpret KDE plot: {kde_plot_desc}", MODEL_CLAUDE_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Interpret Violin plot: {violin_plot_desc}", MODEL_CLAUDE_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Interpret Parallel Coordinates: {parallel_coords_desc}", MODEL_CLAUDE_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Interpret Hypergraph: {hypergraph_desc}", MODEL_CLAUDE_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Interpret Correlation Heatmap: {correlation_heatmap_desc}", MODEL_CLAUDE_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Explain SHAP summary: {shap_analysis_info}", MODEL_CLAUDE_NAME)
            + "\n\n"
        )

        grok_enhanced_insights = (
            analyze_text_with_llm(f"Analyze correlation heatmap for root causes: {correlation_heatmap_desc}", MODEL_GROK_ENHANCED_NAME)
            + "\n\n"
            + analyze_text_with_llm(f"Interpret parallel coordinates plot in depth: {parallel_coords_desc}", MODEL_GROK_ENHANCED_NAME)
        )

        combined_insights = f"""
    Combined Insights Report: Anxiety Intervention Analysis

    Grok-base Analysis:
    {grok_insights}

    Claude 3.7 Sonnet Analysis:
    {claude_insights}

    Grok-Enhanced Analysis:
    {grok_enhanced_insights}

    Synthesized Summary:
    The analyses from our Mixture of Experts framework provides complementary insights into the anxiety intervention dataset. Grok-base highlights statistical robustness and potential causal relationships, noting the strong influence of pre-anxiety. Claude 3.7 Sonnet focuses on visual patterns and feature importance, observing variations between groups and the shift towards lower anxiety levels post-intervention. Grok-Enhanced adds nuanced interpretation of complex relationships, emphasizing the dominance of pre-anxiety with group-specific contextual nuances. The combined insights suggest a multifaceted impact influenced by the intervention, pre-intervention anxiety, and group dynamics, with significant variations and potential for deeper exploration of influential outliers and collaborative networks.

    The correlation heatmap combined with regression-based causal inference offers a more robust approach to identifying root causes, revealing that [key finding from the analysis] is likely the primary factor influencing post-intervention anxiety levels. The parallel coordinates visualization further supports this by showing clear patterns in individual trajectories across groups.
    """
        with open(os.path.join(output_path, "insights.txt"), "w") as f:
            f.write(combined_insights)
        return combined_insights

    except Exception as e:
        print(f"Error generating insights report: {e}")
        return "Error: Could not generate insights report."

def main():
    """Main function to execute the analysis."""
    # 1. Data Loading and Validation
    if not create_output_directory(OUTPUT_PATH):
        return  # Exit if directory creation fails

    synthetic_data = """
participant_id,group,anxiety_pre,anxiety_post,extra_feature1,extra_feature2
1,Group 1,8,6,0.5,12
    2,Group 2,6,4,0.3,15
    3,Group 1,9,7,0.6,10
    4,Group 2,7,5,0.4,18
    5,Group 1,5,3,0.2,11
    6,Group 2,8,6,0.5,14
    7,Group 1,7,5,0.4,9
    8,Group 2,6,4,0.3,20
    9,Group 1,9,8,0.7,13
    10,Group 2,5,3,0.2,16
    """
    df = load_data_from_synthetic_string(synthetic_data)
    required_columns = [
        PARTICIPANT_ID_COLUMN,
        GROUP_COLUMN,
        ANXIETY_PRE_COLUMN,
        ANXIETY_POST_COLUMN,
        "extra_feature1",
        "extra_feature2",
    ]
    if not validate_dataframe(df, required_columns):
        return  # Exit if validation fails

    # 2. Data Preprocessing
    df = pd.get_dummies(df, columns=[GROUP_COLUMN], prefix=GROUP_COLUMN)  # One-hot encode
    df = scale_data(df, [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, "extra_feature1", "extra_feature2"])
    if df is None:
        return # Exit if scaling fails

    # 3. Causal Structure Discovery
    variables = [ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, "extra_feature1", "extra_feature2"] + [col for col in df.columns if col.startswith(f"{GROUP_COLUMN}_")]
    causal_edges_info = discover_causal_structure_with_regression(df, variables, OUTPUT_PATH)
    if causal_edges_info is None:
        print("Causal structure discovery failed.  Continuing without causal graph.")

    # 4. SHAP Value Calculation
    feature_columns = [ANXIETY_PRE_COLUMN, "extra_feature1", "extra_feature2", GROUP_COLUMN]
    shap_analysis_info = calculate_shap_values(df, feature_columns, ANXIETY_POST_COLUMN, OUTPUT_PATH)

    # 5. Data Visualization
    colors = ["#00FFFF", "#FF00FF", "#00FF00"]  # Neon cyan, neon purple, neon green

    kde_plot_desc = create_kde_plot(df, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, colors)
    violin_plot_desc = create_violin_plot(df, GROUP_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, colors)
    parallel_coords_desc = create_parallel_coordinates_plot(df, GROUP_COLUMN, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, colors)
    hypergraph_desc = visualize_hypergraph(df, ANXIETY_PRE_COLUMN, ANXIETY_POST_COLUMN, OUTPUT_PATH, colors)
    correlation_heatmap_desc = create_correlation_heatmap(df, OUTPUT_PATH, colors)

    # 6. Statistical Summary
    bootstrap_ci = perform_bootstrap(df[ANXIETY_POST_COLUMN], np.mean)
    summary_stats_text = save_summary(df, bootstrap_ci, OUTPUT_PATH)

    # 7. DDQN Agent (Simplified)
    state_dim = len(variables) -1 # Exclude target
    action_dim = 3  # Example: Increase, Decrease, Maintain intervention
    agent = DDQNAgent(state_dim, action_dim)
    # Placeholder training data (replace with actual data)
    batch = [
        (0, 0, 1, 1),  # Example: state 0, action 0, reward 1, next state 1
        (1, 1, 2, 2),
    ]
    agent.learn(batch)
    agent.update_target_network()
    print("Simplified DDQN agent initialized and placeholder training completed.")

    # 8. LLM Insights Report
    insights_report = generate_insights_report(
        summary_stats_text,
        causal_edges_info,
        shap_analysis_info,
        kde_plot_desc,
        violin_plot_desc,
        parallel_coords_desc,
        hypergraph_desc,
        correlation_heatmap_desc,
        OUTPUT_PATH,
    )
    print(f"Insights report generated:\n{insights_report}")

if __name__ == "__main__":
    main()