Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated samples: SharePoint utility. Drive and Folder Path input #415

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
O365_TENANT_ID=<YOUR TENANT ID>
SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>

# Pebblo configuration
PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
This solution uses:

- PostgreSQL 15.7
- langchain-community from daxa-ai/langchain branch(https://github.com/daxa-ai/langchain/tree/daxa_3.1)
- langchain-community 0.2.9
- LangChain Microsoft SharePoint loader. See https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_sharepoint for details on the specific steps that must be completed in Microsoft Office 365 and the Azure Portal.

### Instructions
Expand Down Expand Up @@ -42,6 +42,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
O365_TENANT_ID=<YOUR TENANT ID>
SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>

# Postgres configuration
PG_CONNECTION_STRING = "postgresql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE-NAME>"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,88 @@ def get_access_token(self):
else:
return response.json()["access_token"]

@staticmethod
def format_site_url(site_url: str):
    """
    Formats a SharePoint site URL into the "<hostname>:/sites/<site-name>" form
    used when addressing a site by path in the Microsoft Graph API.

    Example:
    1. Default site URL:
    input: https://<tenant-name>.sharepoint.com/
    output: <tenant-name>.sharepoint.com
    2. Custom site URL:
    input: https://<tenant-name>.sharepoint.com/sites/<site-name>
    output: <tenant-name>.sharepoint.com:/sites/<site-name>

    :param site_url: The original SharePoint site URL.
    :return: The site URL without scheme or trailing slash, with a colon after
        the tenant domain when the URL contains a "/sites/" path.
    """
    cleaned = site_url.strip()

    # Remove only a leading scheme; a blanket str.replace would also rewrite
    # any later occurrence of "https://" inside the URL.
    for scheme in ("https://", "http://"):
        if cleaned.lower().startswith(scheme):
            cleaned = cleaned[len(scheme):]
            break

    # A trailing slash is not part of the hostname or the site path.
    cleaned = cleaned.rstrip("/")

    # partition() splits on the first "/sites/" only, so a site path that
    # itself contains "/sites/" is kept intact.
    host, separator, site_path = cleaned.partition("/sites/")
    if not separator:
        return cleaned

    # The caller may already have supplied the colon; avoid doubling it.
    host = host.rstrip(":")
    return f"{host}:/sites/{site_path}"

def get_site_id(self, site_url):
    """
    This function retrieves the ID of a SharePoint site using the Microsoft Graph API.

    Parameters:
    site_url (str): The URL of the SharePoint site.

    Returns:
    str: The ID of the SharePoint site, or None when the Graph API answers with an
    HTTP error status or the response body carries no "id" field.
    """
    # Convert the browser-style URL into the path form used in the request URL
    site_path = self.format_site_url(site_url)
    full_url = f"https://graph.microsoft.com/v1.0/sites/{site_path}"
    try:
        response = requests.get(
            full_url, headers={"Authorization": f"Bearer {self.access_token}"}
        )
        # requests does not raise on 4xx/5xx by itself; without this check an
        # error response would silently yield None (or fail in .json()).
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(
            f"Error while retrieving site ID from Microsoft Graph API, Error: {e}"
        )
        return None
    return response.json().get("id")

def get_drive_id(self, site_id):
    """
    This function retrieves the IDs and names of all drives associated with a specified SharePoint site.

    Parameters:
    site_id (str): The ID of the SharePoint site.

    Returns:
    list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site.
    Each dictionary contains the following keys:
    - 'id': The ID of the drive.
    - 'name': The name of the drive.
    An empty list is returned when the request ends with an HTTP error status
    or the response contains no "value" entries.
    """
    drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
    try:
        response = requests.get(drives_url, headers=self.headers)
        # requests only raises HTTPError when asked to; without this call the
        # except branch below could never run.
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(
            f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}"
        )
        return []

    drives = response.json().get("value", [])
    return [{"id": drive["id"], "name": drive["name"]} for drive in drives]


if __name__ == "__main__":
pass
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,56 @@ def ask(
query=question, auth_context=auth_context, semantic_context=semantic_context
)
# Print chain input in formatted json
print(f"\nchain_input: {chain_input.json(indent=4)}")
print(f"\nchain_input: {chain_input.model_dump_json(indent=4)}")
return self.retrieval_chain.invoke(chain_input.dict())


def select_drive(drives: list) -> tuple:
    """
    Select SharePoint drive from the available drives.

    With no drives the process exits with status 1. With exactly one drive it is
    returned without prompting. Otherwise the drives are listed and the user is
    prompted for a 1-based index; the default is the drive named "Documents"
    (or the first drive if none has that name). Empty, non-numeric, or
    out-of-range input falls back to the default.

    :param drives: List of dictionaries with "id" and "name" keys.
    :return: Tuple of (drive id, drive name) for the selected drive.
    """
    if not drives:
        print("No drives found for the site. Exiting ...")
        # Raise SystemExit directly: the exit() builtin is injected by the
        # site module and is not guaranteed to exist in every runtime.
        raise SystemExit(1)

    if len(drives) == 1:
        only_drive = drives[0]
        return only_drive.get("id"), only_drive.get("name")

    # Select "Documents" as a default drive
    def_drive_idx = next(
        (idx for idx, drive in enumerate(drives) if drive.get("name") == "Documents"),
        0,
    )

    print("Available drives on the site:")
    for position, drive in enumerate(drives, start=1):
        print(f"\t{position}. {drive.get('name')}")

    # Prompt user for drive index
    raw_idx = input(f"Enter drive index (default={def_drive_idx + 1}): ").strip()
    try:
        _drive_idx = int(raw_idx) - 1 if raw_idx else def_drive_idx
    except ValueError:
        # Non-numeric input: mark as invalid so the range check below
        # reports it and falls back to the default instead of crashing.
        _drive_idx = -1

    # Validate drive index and select default drive if invalid
    if not 0 <= _drive_idx < len(drives):
        print("Error. Invalid drive index! Selecting the default drive ...")
        _drive_idx = def_drive_idx

    chosen = drives[_drive_idx]
    return chosen.get("id"), chosen.get("name")


if __name__ == "__main__":
input_collection_name = "identity-enabled-rag-sharepoint"
_client_id = os.environ.get("O365_CLIENT_ID")
_client_secret = os.environ.get("O365_CLIENT_SECRET")
_tenant_id = os.environ.get("O365_TENANT_ID")
_site_url = os.environ.get("SHAREPOINT_SITE_URL")

print("Please enter the app details to authenticate with Microsoft Graph API ...")
app_client_id = input(f"App client id ({_client_id}): ") or _client_id
Expand All @@ -171,12 +212,41 @@ def ask(
)
tenant_id = input(f"Tenant id ({_tenant_id}): ") or _tenant_id

print("\nPlease enter drive id for loading data...")
drive_id = input("Drive id : ")
print("\nInitializing SharepointADHelper ...")
sharepoint_helper = SharepointADHelper(
client_id=app_client_id,
client_secret=app_client_secret,
tenant_id=tenant_id,
)
print("SharepointADHelper initialized.\n")

site_url = (
input(f"Enter Sharepoint Site URL (default={_site_url}): ") or _site_url
)
if not site_url:
print("\nSite URL is required. Exiting ...")
exit(1)
# remove white spaces from the site url
site_url = site_url.strip()

# Get SharePoint Site ID using URL
site_id = sharepoint_helper.get_site_id(site_url)
print(f"Derived Site Id: {site_id}\n")

# Get drive info using site id
print("Fetching drive info ...\n")
drive_info = sharepoint_helper.get_drive_id(site_id)
drive_id, drive_name = select_drive(drive_info)
print(f"SharePoint Drive name: {drive_name}, Drive Id: {drive_id}\n")

# Enter Folder path
def_folder_path = "documents"
folder_path = input(f"Enter folder path (default='{def_folder_path}'): ") or def_folder_path

# Initialize PebbloSafeRAG app
rag_app = PebbloSafeRAG(
drive_id=drive_id,
folder_path="/document",
folder_path=folder_path,
collection_name=input_collection_name,
)

Expand All @@ -202,11 +272,9 @@ def ask(

prompt = input("Please provide the prompt : ")

authorized_identities = SharepointADHelper(
client_id=app_client_id,
client_secret=app_client_secret,
tenant_id=tenant_id,
).get_authorized_identities(end_user_email_address)
authorized_identities = sharepoint_helper.get_authorized_identities(
end_user_email_address
)

response = rag_app.ask(
prompt,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
O365_TENANT_ID=<YOUR TENANT ID>
SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>

# Pebblo configuration
PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
This solution uses:

- PostgreSQL 15.7
- langchain-community 0.2.6
- langchain-community 0.2.9
- LangChain Microsoft SharePoint loader. See https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_sharepoint for details on the specific steps that must be completed in Microsoft Office 365 and the Azure Portal.

### Instructions
Expand Down Expand Up @@ -32,6 +32,7 @@ OPENAI_API_KEY=<YOUR OPENAI API KEY>
O365_CLIENT_ID=<YOUR APPLICATION (CLIENT) ID>
O365_CLIENT_SECRET=<YOUR CLIENT SECRET>
O365_TENANT_ID=<YOUR TENANT ID>
SHAREPOINT_SITE_URL=<YOUR SHAREPOINT SITE URL>

# Pebblo Cloud configuration (optional)
PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,88 @@ def get_access_token(self):
else:
return response.json()["access_token"]

@staticmethod
def format_site_url(site_url: str):
    """
    Formats a SharePoint site URL into the "<hostname>:/sites/<site-name>" form
    used when addressing a site by path in the Microsoft Graph API.

    Example:
    1. Default site URL:
    input: https://<tenant-name>.sharepoint.com/
    output: <tenant-name>.sharepoint.com
    2. Custom site URL:
    input: https://<tenant-name>.sharepoint.com/sites/<site-name>
    output: <tenant-name>.sharepoint.com:/sites/<site-name>

    :param site_url: The original SharePoint site URL.
    :return: The site URL without scheme or trailing slash, with a colon after
        the tenant domain when the URL contains a "/sites/" path.
    """
    cleaned = site_url.strip()

    # Remove only a leading scheme; a blanket str.replace would also rewrite
    # any later occurrence of "https://" inside the URL.
    for scheme in ("https://", "http://"):
        if cleaned.lower().startswith(scheme):
            cleaned = cleaned[len(scheme):]
            break

    # A trailing slash is not part of the hostname or the site path.
    cleaned = cleaned.rstrip("/")

    # partition() splits on the first "/sites/" only, so a site path that
    # itself contains "/sites/" is kept intact.
    host, separator, site_path = cleaned.partition("/sites/")
    if not separator:
        return cleaned

    # The caller may already have supplied the colon; avoid doubling it.
    host = host.rstrip(":")
    return f"{host}:/sites/{site_path}"

def get_site_id(self, site_url):
    """
    This function retrieves the ID of a SharePoint site using the Microsoft Graph API.

    Parameters:
    site_url (str): The URL of the SharePoint site.

    Returns:
    str: The ID of the SharePoint site, or None when the Graph API answers with an
    HTTP error status or the response body carries no "id" field.
    """
    # Convert the browser-style URL into the path form used in the request URL
    site_path = self.format_site_url(site_url)
    full_url = f"https://graph.microsoft.com/v1.0/sites/{site_path}"
    try:
        response = requests.get(
            full_url, headers={"Authorization": f"Bearer {self.access_token}"}
        )
        # requests does not raise on 4xx/5xx by itself; without this check an
        # error response would silently yield None (or fail in .json()).
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(
            f"Error while retrieving site ID from Microsoft Graph API, Error: {e}"
        )
        return None
    return response.json().get("id")

def get_drive_id(self, site_id):
    """
    This function retrieves the IDs and names of all drives associated with a specified SharePoint site.

    Parameters:
    site_id (str): The ID of the SharePoint site.

    Returns:
    list: A list of dictionaries. Each dictionary represents a drive on the SharePoint site.
    Each dictionary contains the following keys:
    - 'id': The ID of the drive.
    - 'name': The name of the drive.
    An empty list is returned when the request ends with an HTTP error status
    or the response contains no "value" entries.
    """
    drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
    try:
        response = requests.get(drives_url, headers=self.headers)
        # requests only raises HTTPError when asked to; without this call the
        # except branch below could never run.
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        print(
            f"Error while retrieving document library ID from Microsoft Graph API, Error: {e}"
        )
        return []

    drives = response.json().get("value", [])
    return [{"id": drive["id"], "name": drive["name"]} for drive in drives]


if __name__ == "__main__":
pass
Loading