Overview
Text summarizers use natural language processing (NLP) to condense complex reports into shorter text while retaining the essential information. These tools typically handle well-specified queries, meaning the input makes the user's needs clear.
However, text summarizers can discard important information from the input text or generate hallucinated output. To address this, Pythia monitors a text summarizer's outputs in real time and guides you toward building more reliable AI systems.
The Wisecube Python SDK lets you integrate Pythia with text summarizers for real-time hallucination detection.
Integrating Pythia with Text Summarizers
1. Get the API key
Submit the API key request form to get your Wisecube API key.
2. Install Wisecube SDK
pip install wisecube
3. Authenticate API key
Authenticate your Wisecube API key to interact with Pythia.
from wisecube_sdk.client import WisecubeClient

API_KEY = "YOUR_API_KEY"
client = WisecubeClient(API_KEY).client
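If you prefer not to hard-code the key, a small sketch (our suggestion, not an SDK requirement) reads it from an environment variable instead:

import os

# Assumed variable name: WISECUBE_API_KEY; fall back to the placeholder if unset,
# so the key never has to live in source control.
API_KEY = os.environ.get("WISECUBE_API_KEY", "YOUR_API_KEY")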
4. Develop a Text Summarizer
pip install beautifulsoup4
pip install lxml
pip install nltk
from urllib.request import urlopen
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from collections import Counter
import heapq
import math

nltk.download('punkt')
nltk.download('stopwords')
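# Note (assumption): NLTK 3.9+ also needs the 'punkt_tab' resource for
# sent_tokenize; downloading it is harmless on older versions.
nltk.download('punkt_tab')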
def summarize(text, num_sentences=5):
    """
    Summarizes the given text by extracting the most important sentences.

    Args:
        text: The text to be summarized.
        num_sentences: The number of sentences to include in the summary (default: 5).

    Returns:
        A string containing the summarized text.
    """
    # Preprocess the text
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_sentences = [
        [word for word in sentence.lower().split() if word not in stop_words]
        for sentence in sentences
    ]

    # Calculate sentence scores based on word frequency (a simple TF-IDF variant)
    word_counts = Counter()
    total_words = 0
    for sentence in filtered_sentences:
        word_counts.update(sentence)
        total_words += len(sentence)

    sentence_scores = {i: 0 for i in range(len(filtered_sentences))}
    for i, sentence in enumerate(filtered_sentences):
        for word in sentence:
            tf = word_counts[word] / total_words  # Term frequency across the document
            # Inverse document frequency, treating each sentence as a "document"
            df = sum(word in s for s in filtered_sentences)
            idf = math.log(len(filtered_sentences) / (1 + df))
            sentence_scores[i] += tf * idf

    # Select the top-scoring sentences and restore their original order
    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(sentences[i] for i in sorted(summary_sentences))
    return summary
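To sanity-check the extractive step on its own, here is a quick example; the sample string is ours, not from the Wisecube docs:

# Quick check of the extractive summarizer on a made-up paragraph.
sample_text = (
    "Ancient Rome began as a small settlement on the Italian Peninsula. "
    "It grew into an empire that dominated the Mediterranean world. "
    "Roman law and engineering still influence modern societies. "
    "The empire eventually split into western and eastern halves."
)
print(summarize(sample_text, num_sentences=2))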
def summarize_from_web(url, num_sentences=5):
    """
    Summarizes the text content of a webpage.

    Args:
        url: The URL of the webpage to summarize.
        num_sentences: The number of sentences to include in the summary (default: 5).

    Returns:
        A string containing the summarized text from the webpage, or None if unable to access the webpage.
    """
    try:
        html_content = urlopen(url).read()
        soup = BeautifulSoup(html_content, features="html.parser")
        # Identify the main content area (adjust the selector to the website's structure)
        paragraphs = soup.find_all('p')
        # Combine the paragraphs into a single string
        text = ' '.join(paragraph.get_text() for paragraph in paragraphs)
        summary = summarize(text, num_sentences)
        return summary
    except Exception as e:
        print(f"Error accessing webpage: {e}")
        return None
def get_summary(url):
    summary = summarize_from_web(url)
    if summary:
        print(f"Summary: {summary}")
    else:
        print("Failed to summarize the webpage.")

get_summary("https://en.wikipedia.org/wiki/Ancient_Rome")
5. Detect hallucinations with Pythia
Modify the summarize_from_web and get_summary functions to send the summary to Pythia through ask_pythia and display both the summary and its accuracy score to the user.
def summarize_from_web(url, num_sentences=5):
    try:
        html_content = urlopen(url).read()
        soup = BeautifulSoup(html_content, features="html.parser")
        paragraphs = soup.find_all('p')
        text = ' '.join(paragraph.get_text() for paragraph in paragraphs)
        summary = summarize(text, num_sentences)
        # Ask Pythia to check the summary against the source text
        accuracy = client.ask_pythia([text], summary, "")
        return summary, accuracy['data']['askPythia']['metrics']['accuracy']
    except Exception as e:
        print(f"Error accessing webpage: {e}")
        return None, None  # Keep the tuple shape so the caller can unpack safely
def get_summary(url):
    summary, accuracy = summarize_from_web(url)
    if summary:
        print(f"Summary: {summary}")
        print(f"Accuracy: {accuracy:.4f}")
    else:
        print("Failed to summarize the webpage.")