# kgllm/app.py

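"""Flask app that builds a knowledge graph from crawled web pages (or Wikipedia text):
a local llama-cpp model extracts source/target/relationship triples, an AmpliGraph
ComplEx model is trained on those triples, and the result is plotted with PyGraphistry."""
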
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import wikipediaapi
from flask import Flask, request, render_template
from llama_cpp import Llama
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from ampligraph.datasets import load_from_csv
import graphistry
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
from cachetools import cached, TTLCache
from tqdm import tqdm
import fitz # PyMuPDF
import os

# Configuration
WIKIPEDIA_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
LLM_MODEL_PATH = "./dolphin-2_6-phi-2.Q4_K_M.gguf"
GRAPHISTRY_API = {
    "api": 3,
    "protocol": "https",
    "server": "hub.graphistry.com",
    "username": "mkommar",
    "password": "pyrotech"
}
CACHE_TTL = 3600  # Cache TTL in seconds

# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.debug = print

# Force TensorFlow to use the CPU
tf.config.set_visible_devices([], 'GPU')

# Initialize Flask app
app = Flask(__name__)

# Initialize Wikipedia API
wiki_wiki = wikipediaapi.Wikipedia(WIKIPEDIA_AGENT)

# Initialize llama_cpp model
model = Llama(
    model_path=LLM_MODEL_PATH,
    n_ctx=2048,
    n_threads=8,
    n_gpu_layers=35
)

# Register PyGraphistry
graphistry.register(**GRAPHISTRY_API)

# Cache
wiki_cache = TTLCache(maxsize=100, ttl=CACHE_TTL)
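
# Fetch the full plain text of a Wikipedia page; results are cached for CACHE_TTL seconds.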
@cached(wiki_cache)
def fetch_wikipedia_data(query):
    logger.debug(f"Fetching Wikipedia data for query: {query}")
    page = wiki_wiki.page(query)
    if page.exists():
        logger.debug("Page found")
        return page.text
    logger.debug("Page not found")
    return ""
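
# Download a PDF and extract its text with PyMuPDF (written to a temporary file first).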
def extract_text_from_pdf(pdf_url):
    logger.debug(f"Extracting text from PDF: {pdf_url}")
    response = requests.get(pdf_url)
    response.raise_for_status()
    with open('temp.pdf', 'wb') as f:
        f.write(response.content)
    pdf_text = ""
    with fitz.open('temp.pdf') as doc:
        for page in doc:
            pdf_text += page.get_text()
    os.remove('temp.pdf')
    return pdf_text
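
# Slide a window of chunk_size characters (with overlap) across the text, ask the LLM to
# emit "{ source, target, relationship }" lines for each chunk, and parse those lines
# into a list of edge dicts.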
def process_text(text):
    logger.debug("Processing text with Llama_cpp")
    chunk_size = 512
    overlap = 50
    knowledge_graph_elements = []

    def process_chunk(chunk):
        triple_format = """{ source: "Term1", target: "Term2", relationship: "relation" },
{ source: "Term3", target: "Term4", relationship: "relation" },
{ source: "Term5", target: "Term6", relationship: "relation" },
{ source: "Term7", target: "Term8", relationship: "relation" },
{ source: "Term9", target: "Term10", relationship: "relation" }"""
        query = f"Extract knowledge graph elements in the format: {triple_format}. Use the actual terms from the text instead of placeholders like 'ConceptA' or 'ConceptB'.\n\nText: {chunk}"
        logger.debug(f"Sending query to LLM: {query}")
        response = model(f"User: {query}\nAI:", max_tokens=chunk_size, stop=["User:", "AI:"])['choices'][0]['text'].strip()
        logger.debug(f"LLM response: {response}")
        return extract_relationships(response)

    def extract_relationships(response):
        logger.debug("Extracting relationships from LLM response")
        elements = []
        lines = response.split('\n')
        for line in lines:
            if 'source' in line and 'target' in line and 'relationship' in line:
                try:
                    parts = line.replace('{', '').replace('}', '').replace('"', '').strip().split(',')
                    element = {
                        'source': parts[0].split(':')[-1].strip(),
                        'target': parts[1].split(':')[-1].strip(),
                        'relationship': parts[2].split(':')[-1].strip()
                    }
                    elements.append(element)
                except IndexError as e:
                    logger.error(f"Error parsing line: {line} - {e}")
                    continue
        logger.debug(f"Extracted elements: {elements}")
        return elements

    start = 0
    step = chunk_size - overlap
    total_chunks = max(1, (len(text) + step - 1) // step)  # ceiling division; always process at least one chunk
    for _ in tqdm(range(total_chunks), desc="Processing Text"):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        knowledge_graph_elements.extend(process_chunk(chunk))
        if end >= len(text):
            break
        start = end - overlap
    logger.debug(f"Final knowledge graph elements: {knowledge_graph_elements}")
    return knowledge_graph_elements
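
# Placeholder for knowledge fusion (entity resolution / deduplication); currently a passthrough.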
def perform_knowledge_fusion(processed_text):
    logger.debug("Performing knowledge fusion")
    return processed_text
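
# Train an AmpliGraph ComplEx embedding model on the triples
# (k=150 embedding dimensions, eta=10 negative samples per positive).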
def train_ampligraph(X):
    logger.debug("Training AmpliGraph model")
    kg_model = ScoringBasedEmbeddingModel(k=150,
                                          eta=10,
                                          scoring_type='ComplEx')
    optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = get_loss('pairwise', {'margin': 0.5})
    regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})
    kg_model.compile(optimizer=optim, loss=loss, entity_relation_regularizer=regularizer)
    kg_model.fit(X)
    logger.debug("Model training completed")
    return kg_model
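
# Score triples with the trained model; a positive score is treated as a plausible link.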
def make_predictions(model, triples):
    logger.debug("Making predictions with AmpliGraph model")
    scores = model.predict(triples)
    predictions = np.array(scores) > 0
    logger.debug(f"Predictions: {predictions}")
    return predictions
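
# Build a PyGraphistry edge list from the triples; edges are coloured green/red by
# prediction when predictions are supplied, otherwise blue.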
def visualize_data(data, predictions=None):
    logger.debug("Visualizing data with PyGraphistry")
    edges = []
    for i, item in enumerate(data):
        edge = {
            'source': item['source'],
            'target': item['target'],
            'relationship': item['relationship']
        }
        if predictions is not None and i < len(predictions):
            edge['color'] = 'green' if predictions[i] else 'red'
        else:
            edge['color'] = 'blue'
        edges.append(edge)
    df = pd.DataFrame(edges)
    logger.debug(f"DataFrame for visualization: {df}")
    g = graphistry.edges(df, 'source', 'target').bind(edge_title='relationship', edge_color='color')
    g.plot()
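
# End-to-end Wikipedia pipeline (fetch -> extract triples -> fuse); not wired into the
# Flask route below, which crawls an arbitrary URL instead.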
def generate_knowledge_graph(query):
    logger.debug(f"Generating knowledge graph for query: {query}")
    text = fetch_wikipedia_data(query)
    processed_text = process_text(text)
    knowledge_data = perform_knowledge_fusion(processed_text)
    logger.debug(f"Generated knowledge data: {knowledge_data}")
    return knowledge_data
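
# Ask the LLM for related Wikipedia page titles (helper; not referenced by the route below).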
def suggest_pages(query):
    logger.debug(f"Suggesting pages for query: {query}")
    response = model(f"User: Suggest related Wikipedia pages for: {query}\nAI:", max_tokens=150, stop=["User:", "AI:"])['choices'][0]['text'].strip()
    logger.debug(f"Suggested pages: {response}")
    return response
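
# Breadth-first crawl starting at `url`, staying on the same domain, up to `depth` levels.
# PDF links are read with PyMuPDF, HTML pages with BeautifulSoup; all page text is concatenated.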
def crawl_page_with_progress(url, depth=3):
    logger.debug(f"Crawling page: {url} at depth: {depth}")
    visited = set()
    to_visit = [(url, 0)]
    all_texts = []
    added_urls = set()
    total_to_visit = depth * 10  # Estimate, adjust based on expected average links per page
    progress_bar = tqdm(total=total_to_visit, desc="Crawling Pages")
    while to_visit:
        current_url, current_depth = to_visit.pop(0)
        if current_depth > depth or current_url in visited:
            continue
        start_time = time.time()
        try:
            response = requests.get(current_url)
            response.raise_for_status()
            visited.add(current_url)
            end_time = time.time()
            download_time = end_time - start_time
            download_speed = len(response.content) / download_time / 1024  # KB/s
            soup = None
            if current_url.endswith('.pdf'):
                page_text = extract_text_from_pdf(current_url)
            else:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.get_text()
            all_texts.append(page_text)
            logger.debug(f"Visited {current_url} - Download Speed: {download_speed:.2f} KB/s")
            # Only HTML pages have links to follow; PDFs are leaf nodes.
            if soup is not None and current_depth < depth:
                links_found = 0
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)
                    full_url, _ = urldefrag(full_url)  # Remove the fragment part
                    # Stay within the same domain and avoid self-references
                    if urlparse(full_url).netloc == urlparse(url).netloc and full_url != current_url:
                        if full_url not in added_urls:
                            to_visit.append((full_url, current_depth + 1))
                            added_urls.add(full_url)
                            links_found += 1
                progress_bar.update(links_found)
        except requests.RequestException as e:
            logger.error(f"Error crawling {current_url}: {e}")
        progress_bar.update(1)
    progress_bar.close()
    combined_text = ' '.join(all_texts)
    return combined_text
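
# Flask entry point: crawl the submitted URL, extract triples with the LLM, train an
# AmpliGraph model on them, and plot the coloured graph with PyGraphistry.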
@app.route('/', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        url = request.form['url']
        crawl_depth = int(request.form.get('depth', 3))
        logger.debug(f"Received URL: {url} with crawl depth: {crawl_depth}")
        crawled_text = crawl_page_with_progress(url, depth=crawl_depth)
        knowledge_data = process_text(crawled_text)
        data_for_training = [(item['source'], item['relationship'], item['target']) for item in knowledge_data]
        df = pd.DataFrame(data_for_training, columns=['source', 'predicate', 'target'])
        # Write without a header row so load_from_csv does not ingest it as a triple.
        df.to_csv('generated_knowledge_data.csv', index=False, header=False)
        logger.debug(f"Data saved to CSV: {df}")
        X = load_from_csv('.', 'generated_knowledge_data.csv', sep=',')
        kg_model = train_ampligraph(X)
        predictions = make_predictions(kg_model, X)
        visualize_data(knowledge_data, predictions)
        return render_template('result.html', url=url)
    return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True, port=8000)