daxa-ai · Raj725 · Sep 26, 2024
diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/.env.sample
@@ -5,6 +5,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
 O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
 O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
 O365_TENANT_ID=<YOUR TENANT ID>
+SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>
 
 # Pebblo configuration
 PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>

diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/README.md
@@ -3,7 +3,7 @@
 This solution uses:
 
 - PostgreSQL 15.7
-- langchain-community from daxa-ai/langchain branch(https://github.com/daxa-ai/langchain/tree/daxa_3.1)
+- langchain-community 0.2.9
 - LangChain Microsoft Sharepoint loader. See https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_sharepoint for details on specific steps required to be completed in Microsoft Office 365 and Azure Portal.
 
 ### Instructions
@@ -42,6 +42,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
 O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
 O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
 O365_TENANT_ID=<YOUR TENANT ID>
+SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>
 
 # Postgres configuration
 PG_CONNECTION_STRING = "postgresql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE-NAME>"

diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/msgraph_api_auth.py
@@ -127,6 +127,88 @@ def get_access_token(self):
         else:
             return response.json()["access_token"]
 
+    @staticmethod
+    def format_site_url(site_url: str) -> str:
+        """
+        Formats a SharePoint site URL into the `{hostname}:/{relative-path}` form
+        that the Microsoft Graph API uses to address a site by path.
+        Example:
+        1. Default site URL:
+            input: https://<tenant-name>.sharepoint.com/
+            output: <tenant-name>.sharepoint.com
+        2. Custom site URL:
+            input: https://<tenant-name>.sharepoint.com/sites/<site-name>
+            output: <tenant-name>.sharepoint.com:/sites/<site-name>
+
+        :param site_url: The original SharePoint site URL.
+        :return: The hostname, followed by `:/<path>` when the URL has a path.
+        """
+        url = site_url.strip()
+        # Drop the scheme; only the hostname and the path are kept.
+        for scheme in ("https://", "http://"):
+            if url.lower().startswith(scheme):
+                url = url[len(scheme) :]
+                break
+        # Everything before the first "/" is the hostname, the rest is the path.
+        host, _, path = url.partition("/")
+        # An already formatted URL carries the colon on the hostname; drop it so
+        # it is not added twice. Leading/trailing slashes are not part of the path.
+        host = host.rstrip(":")
+        path = path.strip("/")
+        if not path:
+            return host
+        return f"{host}:/{path}"
+
+    def get_site_id(self, site_url):
+        """
+        Retrieves the ID of a SharePoint site using the Microsoft Graph API.
+
+        Parameters:
+        site_url (str): The URL of the SharePoint site.
+
+        Returns:
+        str | None: The site ID, or None (after printing the error) on failure.
+        """
+        # Graph looks a site up by hostname and server-relative path, not full URL
+        site_path = self.format_site_url(site_url)
+        full_url = f"https://graph.microsoft.com/v1.0/sites/{site_path}"
+        headers = {"Authorization": f"Bearer {self.access_token}"}
+        response = requests.get(full_url, headers=headers)
+        if not response.ok:
+            print(f"Error while retrieving site ID, HTTP {response.status_code}: {response.text}")
+            return None
+        return response.json().get("id")
+
+    def get_drive_id(self, site_id):
+        """
+        This function retrieves the IDs and names of all drives associated with a specified SharePoint site.
+
+        Parameters:
+        site_id (str): The ID of the SharePoint site.
+
+        Returns:
+        list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site.
+              Each dictionary contains the following keys:
+              - 'id': The ID of the drive.
+              - 'name': The name of the drive.
+              An empty list is returned when the request fails (the error is printed).
+        """
+
+        # Retrieve drive IDs and names associated with a site
+        try:
+            drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
+            response = requests.get(drives_url, headers=self.headers)
+            # requests does not raise on 4xx/5xx by itself; without this call the
+            # HTTPError handler below can never run
+            response.raise_for_status()
+            drives = response.json().get("value", [])
+            return [{"id": drive["id"], "name": drive["name"]} for drive in drives]
+        except requests.exceptions.HTTPError as e:
+            print(
+                f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}"
+            )
+            return []
+
 
 if __name__ == "__main__":
     pass
diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-postgres/pebblo_saferag.py
@@ -154,15 +154,56 @@ def ask(
             query=question, auth_context=auth_context, semantic_context=semantic_context
         )
         # Print chain input in formatted json
-        print(f"\nchain_input: {chain_input.json(indent=4)}")
+        print(f"\nchain_input: {chain_input.model_dump_json(indent=4)}")
         return self.retrieval_chain.invoke(chain_input.dict())
 
 
+def select_drive(drives: list) -> tuple:
+    """
+    Select a SharePoint drive from the available drives.
+
+    A single drive is selected automatically. With several drives the user is
+    prompted for a 1-based index; empty, non-numeric or out-of-range input
+    falls back to the drive named "Documents" (or the first drive).
+
+    :param drives: List of dicts with the "id" and "name" of each drive.
+    :return: Tuple of (drive id, drive name) of the selected drive.
+    :raises SystemExit: If `drives` is empty.
+    """
+    if not drives:
+        print("No drives found for the site. Exiting ...")
+        # `exit` is injected by the `site` module and may be missing; SystemExit
+        # is what it raises under the hood.
+        raise SystemExit(1)
+    if len(drives) == 1:
+        selected = drives[0]
+        return selected.get("id"), selected.get("name")
+
+    # Select "Documents" as a default drive, falling back to the first drive
+    names = [drive.get("name") for drive in drives]
+    def_drive_idx = names.index("Documents") if "Documents" in names else 0
+    print("Available drives on the site:")
+    for idx, name in enumerate(names, start=1):
+        print(f"\t{idx}. {name}")
+
+    # Prompt user for drive index
+    answer = input(f"Enter drive index (default={def_drive_idx + 1}): ").strip()
+    try:
+        drive_idx = int(answer) - 1 if answer else def_drive_idx
+    except ValueError:
+        drive_idx = -1  # not a number: handled as an invalid index below
+    if not 0 <= drive_idx < len(drives):
+        print("Error. Invalid drive index! Selecting the default drive ...")
+        drive_idx = def_drive_idx
+    return drives[drive_idx].get("id"), drives[drive_idx].get("name")
+
+
 if __name__ == "__main__":
     input_collection_name = "identity-enabled-rag-sharepoint"
     _client_id = os.environ.get("O365_CLIENT_ID")
     _client_secret = os.environ.get("O365_CLIENT_SECRET")
     _tenant_id = os.environ.get("O365_TENANT_ID")
+    _site_url = os.environ.get("SHAREPOINT_SITE_URL")
 
     print("Please enter the app details to authenticate with Microsoft Graph API ...")
     app_client_id = input(f"App client id ({_client_id}): ") or _client_id
@@ -171,12 +212,41 @@ def ask(
     )
     tenant_id = input(f"Tenant id ({_tenant_id}): ") or _tenant_id
 
-    print("\nPlease enter drive id for loading data...")
-    drive_id = input("Drive id : ")
+    print("\nInitializing SharepointADHelper ...")
+    sharepoint_helper = SharepointADHelper(
+        client_id=app_client_id,
+        client_secret=app_client_secret,
+        tenant_id=tenant_id,
+    )
+    print("SharepointADHelper initialized.\n")
 
+    site_url = (
+        input(f"Enter Sharepoint Site URL (default={_site_url}): ") or _site_url
+    )
+    if not site_url:
+        print("\nSite URL is required. Exiting ...")
+        exit(1)
+    # remove white spaces from the site url
+    site_url = site_url.strip()
+
+    # Get SharePoint Site ID using URL
+    site_id = sharepoint_helper.get_site_id(site_url)
+    print(f"Derived Site Id: {site_id}\n")
+
+    # Get drive info using site id
+    print("Fetching drive info ...\n")
+    drive_info = sharepoint_helper.get_drive_id(site_id)
+    drive_id, drive_name = select_drive(drive_info)
+    print(f"SharePoint Drive name: {drive_name}, Drive Id: {drive_id}\n")
+
+    # Enter Folder path
+    def_folder_path = "documents"
+    folder_path = input(f"Enter folder path (default='{def_folder_path}'): ") or def_folder_path
+
+    # Initialize PebbloSafeRAG app
     rag_app = PebbloSafeRAG(
         drive_id=drive_id,
-        folder_path="/document",
+        folder_path=folder_path,
         collection_name=input_collection_name,
     )
 
@@ -202,11 +272,9 @@ def ask(
 
         prompt = input("Please provide the prompt : ")
 
-        authorized_identities = SharepointADHelper(
-            client_id=app_client_id,
-            client_secret=app_client_secret,
-            tenant_id=tenant_id,
-        ).get_authorized_identities(end_user_email_address)
+        authorized_identities = sharepoint_helper.get_authorized_identities(
+            end_user_email_address
+        )
 
         response = rag_app.ask(
             prompt,

diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/.env.sample
@@ -5,6 +5,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
 O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
 O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
 O365_TENANT_ID=<YOUR TENANT ID>
+SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>
 
 # Pebblo configuration
 PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>

diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/README.md
@@ -3,7 +3,7 @@
 This solution uses:
 
 - PostgreSQL 15.7
-- langchain-community 0.2.6
+- langchain-community 0.2.9
 - LangChain Microsoft Sharepoint loader. See https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_sharepoint for details on specific steps required to be completed in Microsoft Office 365 and Azure Portal.
 
 ### Instructions
@@ -32,6 +32,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
 O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
 O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
 O365_TENANT_ID=<YOUR TENANT ID>
+SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>
 
 # Pebblo Cloud configuration (optional)
 PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>

diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/sharepoint-qdrant/msgraph_api_auth.py
@@ -127,6 +127,88 @@ def get_access_token(self):
         else:
             return response.json()["access_token"]
 
+    @staticmethod
+    def format_site_url(site_url: str) -> str:
+        """
+        Formats a SharePoint site URL into the `{hostname}:/{relative-path}` form
+        that the Microsoft Graph API uses to address a site by path.
+        Example:
+        1. Default site URL:
+            input: https://<tenant-name>.sharepoint.com/
+            output: <tenant-name>.sharepoint.com
+        2. Custom site URL:
+            input: https://<tenant-name>.sharepoint.com/sites/<site-name>
+            output: <tenant-name>.sharepoint.com:/sites/<site-name>
+
+        :param site_url: The original SharePoint site URL.
+        :return: The hostname, followed by `:/<path>` when the URL has a path.
+        """
+        url = site_url.strip()
+        # Drop the scheme; only the hostname and the path are kept.
+        for scheme in ("https://", "http://"):
+            if url.lower().startswith(scheme):
+                url = url[len(scheme) :]
+                break
+        # Everything before the first "/" is the hostname, the rest is the path.
+        host, _, path = url.partition("/")
+        # An already formatted URL carries the colon on the hostname; drop it so
+        # it is not added twice. Leading/trailing slashes are not part of the path.
+        host = host.rstrip(":")
+        path = path.strip("/")
+        if not path:
+            return host
+        return f"{host}:/{path}"
+
+    def get_site_id(self, site_url):
+        """
+        Retrieves the ID of a SharePoint site using the Microsoft Graph API.
+
+        Parameters:
+        site_url (str): The URL of the SharePoint site.
+
+        Returns:
+        str | None: The site ID, or None (after printing the error) on failure.
+        """
+        # Graph looks a site up by hostname and server-relative path, not full URL
+        site_path = self.format_site_url(site_url)
+        full_url = f"https://graph.microsoft.com/v1.0/sites/{site_path}"
+        headers = {"Authorization": f"Bearer {self.access_token}"}
+        response = requests.get(full_url, headers=headers)
+        if not response.ok:
+            print(f"Error while retrieving site ID, HTTP {response.status_code}: {response.text}")
+            return None
+        return response.json().get("id")
+
+    def get_drive_id(self, site_id):
+        """
+        This function retrieves the IDs and names of all drives associated with a specified SharePoint site.
+
+        Parameters:
+        site_id (str): The ID of the SharePoint site.
+
+        Returns:
+        list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site.
+              Each dictionary contains the following keys:
+              - 'id': The ID of the drive.
+              - 'name': The name of the drive.
+              An empty list is returned when the request fails (the error is printed).
+        """
+
+        # Retrieve drive IDs and names associated with a site
+        try:
+            drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
+            response = requests.get(drives_url, headers=self.headers)
+            # requests does not raise on 4xx/5xx by itself; without this call the
+            # HTTPError handler below can never run
+            response.raise_for_status()
+            drives = response.json().get("value", [])
+            return [{"id": drive["id"], "name": drive["name"]} for drive in drives]
+        except requests.exceptions.HTTPError as e:
+            print(
+                f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}"
+            )
+            return []
+
 
 if __name__ == "__main__":
     pass