import logging
import os
import time
from urllib.parse import urljoin, urlparse, urldefrag

import fitz  # PyMuPDF
import graphistry
import numpy as np
import pandas as pd
import requests
import tensorflow as tf
import wikipediaapi
from ampligraph.datasets import load_from_csv
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from bs4 import BeautifulSoup
from cachetools import cached, TTLCache
from flask import Flask, request, render_template
from llama_cpp import Llama
from tqdm import tqdm

# Configuration
WIKIPEDIA_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
LLM_MODEL_PATH = "./dolphin-2_6-phi-2.Q4_K_M.gguf"
GRAPHISTRY_API = {
    "api": 3,
    "protocol": "https",
    "server": "hub.graphistry.com",
    "username": "mkommar",
    "password": "pyrotech"
}
CACHE_TTL = 3600  # Cache TTL in seconds

# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.debug = print

# Force TensorFlow to use the CPU
tf.config.set_visible_devices([], 'GPU')

# Initialize Flask app
app = Flask(__name__)

# Initialize Wikipedia API
wiki_wiki = wikipediaapi.Wikipedia(WIKIPEDIA_AGENT)

# Initialize llama_cpp model
model = Llama(
    model_path=LLM_MODEL_PATH,
    n_ctx=2048,
    n_threads=8,
    n_gpu_layers=35
)

# Register PyGraphistry
graphistry.register(**GRAPHISTRY_API)

# Cache
wiki_cache = TTLCache(maxsize=100, ttl=CACHE_TTL)


@cached(wiki_cache)
def fetch_wikipedia_data(query):
    logger.debug(f"Fetching Wikipedia data for query: {query}")
    page = wiki_wiki.page(query)
    if page.exists():
        logger.debug("Page found")
        return page.text
    logger.debug("Page not found")
    return ""


def extract_text_from_pdf(pdf_url):
    logger.debug(f"Extracting text from PDF: {pdf_url}")
    response = requests.get(pdf_url)
    response.raise_for_status()
    with open('temp.pdf', 'wb') as f:
        f.write(response.content)
    pdf_text = ""
    with fitz.open('temp.pdf') as doc:
        for page in doc:
            pdf_text += page.get_text()
    os.remove('temp.pdf')
    return pdf_text

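# Illustrative note: process_text below asks the LLM for one JSON-like record per line,
# which the nested extract_relationships helper then parses. The terms in this sample are
# hypothetical examples, not real model output:
#   { source: "Knowledge graph", target: "Ontology", relationship: "uses" },
#   { source: "Ontology", target: "RDF", relationship: "expressed in" },
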
def process_text(text):
    logger.debug("Processing text with Llama_cpp")
    chunk_size = 512
    overlap = 50
    knowledge_graph_elements = []

    def process_chunk(chunk):
        # Example output format the LLM is asked to follow (placeholders only)
        output_format = """{ source: "Term1", target: "Term2", relationship: "relation" },
{ source: "Term3", target: "Term4", relationship: "relation" },
{ source: "Term5", target: "Term6", relationship: "relation" },
{ source: "Term7", target: "Term8", relationship: "relation" },
{ source: "Term9", target: "Term10", relationship: "relation" }"""
        query = (
            f"Extract knowledge graph elements in the format: {output_format}. "
            "Use the actual terms from the text instead of placeholders like "
            f"'ConceptA' or 'ConceptB'.\n\nText: {chunk}"
        )
        logger.debug(f"Sending query to LLM: {query}")
        response = model(
            f"User: {query}\nAI:",
            max_tokens=chunk_size,
            stop=["User:", "AI:"]
        )['choices'][0]['text'].strip()
        logger.debug(f"LLM response: {response}")
        return extract_relationships(response)

    def extract_relationships(response):
        logger.debug("Extracting relationships from LLM response")
        elements = []
        lines = response.split('\n')
        for line in lines:
            if 'source' in line and 'target' in line and 'relationship' in line:
                try:
                    parts = line.replace('{', '').replace('}', '').replace('"', '').strip().split(',')
                    element = {
                        'source': parts[0].split(':')[-1].strip(),
                        'target': parts[1].split(':')[-1].strip(),
                        'relationship': parts[2].split(':')[-1].strip()
                    }
                    elements.append(element)
                except IndexError as e:
                    logger.error(f"Error parsing line: {line} - {e}")
                    continue
        logger.debug(f"Extracted elements: {elements}")
        return elements

    start = 0
    # Floor division would yield zero chunks for short texts, so process at least one
    total_chunks = max(1, (len(text) - overlap) // (chunk_size - overlap)) if text else 0
    for _ in tqdm(range(total_chunks), desc="Processing Text"):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        knowledge_graph_elements.extend(process_chunk(chunk))
        start = end - overlap

    logger.debug(f"Final knowledge graph elements: {knowledge_graph_elements}")
    return knowledge_graph_elements


def perform_knowledge_fusion(processed_text):
    logger.debug("Performing knowledge fusion")
    # Placeholder: elements currently pass through without deduplication or merging
    return processed_text


def train_ampligraph(X):
    logger.debug("Training AmpliGraph model")
    kg_model = ScoringBasedEmbeddingModel(k=150, eta=10, scoring_type='ComplEx')
    optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = get_loss('pairwise', {'margin': 0.5})
    regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})
    kg_model.compile(optimizer=optim, loss=loss, entity_relation_regularizer=regularizer)
    kg_model.fit(X)
    logger.debug("Model training completed")
    return kg_model


def make_predictions(model, triples):
    logger.debug("Making predictions with AmpliGraph model")
    scores = model.predict(triples)
    predictions = np.array(scores) > 0
    logger.debug(f"Predictions: {predictions}")
    return predictions


def visualize_data(data, predictions=None):
    logger.debug("Visualizing data with PyGraphistry")
    edges = []
    for i, item in enumerate(data):
        edge = {
            'source': item['source'],
            'target': item['target'],
            'relationship': item['relationship']
        }
        if predictions is not None and i < len(predictions):
            edge['color'] = 'green' if predictions[i] else 'red'
        else:
            edge['color'] = 'blue'
        edges.append(edge)
    df = pd.DataFrame(edges)
    logger.debug(f"DataFrame for visualization: {df}")
    g = graphistry.edges(df, 'source', 'target').bind(edge_title='relationship', edge_color='color')
    g.plot()


def generate_knowledge_graph(query):
    logger.debug(f"Generating knowledge graph for query: {query}")
    text = fetch_wikipedia_data(query)
    processed_text = process_text(text)
    knowledge_data = perform_knowledge_fusion(processed_text)
    logger.debug(f"Generated knowledge data: {knowledge_data}")
    return knowledge_data


def suggest_pages(query):
    logger.debug(f"Suggesting pages for query: {query}")
    response = model(
        f"User: Suggest related Wikipedia pages for: {query}\nAI:",
        max_tokens=150,
        stop=["User:", "AI:"]
    )['choices'][0]['text'].strip()
    logger.debug(f"Suggested pages: {response}")
    return response

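# Illustrative note: train_ampligraph above consumes an (n, 3) array of
# (subject, predicate, object) string triples, the same layout as the CSV written by the
# Flask route below. Example rows (hypothetical values, no header line):
#   Knowledge graph,uses,Ontology
#   Ontology,expressed in,RDF
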
def crawl_page_with_progress(url, depth=3):
    logger.debug(f"Crawling page: {url} at depth: {depth}")
    visited = set()
    to_visit = [(url, 0)]
    all_texts = []
    added_urls = set()
    total_to_visit = depth * 10  # Rough estimate; adjust based on expected average links per page
    progress_bar = tqdm(total=total_to_visit, desc="Crawling Pages")

    while to_visit:
        current_url, current_depth = to_visit.pop(0)
        if current_depth > depth or current_url in visited:
            continue
        start_time = time.time()
        try:
            response = requests.get(current_url)
            response.raise_for_status()
            visited.add(current_url)
            end_time = time.time()
            download_time = end_time - start_time
            download_speed = len(response.content) / download_time / 1024  # KB/s

            soup = None
            if current_url.endswith('.pdf'):
                page_text = extract_text_from_pdf(current_url)
            else:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.get_text()
            all_texts.append(page_text)
            logger.debug(f"Visited {current_url} - Download Speed: {download_speed:.2f} KB/s")

            # Only HTML pages yield links to follow; PDFs are leaf nodes
            if current_depth < depth and soup is not None:
                links_found = 0
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)
                    full_url, _ = urldefrag(full_url)  # Remove the fragment part
                    # Stay within the same domain and avoid self-references
                    if urlparse(full_url).netloc == urlparse(url).netloc and full_url != current_url:
                        if full_url not in added_urls:
                            to_visit.append((full_url, current_depth + 1))
                            added_urls.add(full_url)
                            links_found += 1
                progress_bar.update(links_found)
        except requests.RequestException as e:
            logger.error(f"Error crawling {current_url}: {e}")
            progress_bar.update(1)

    progress_bar.close()
    combined_text = ' '.join(all_texts)
    return combined_text


@app.route('/', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        url = request.form['url']
        crawl_depth = int(request.form.get('depth', 3))
        logger.debug(f"Received URL: {url} with crawl depth: {crawl_depth}")
        crawled_text = crawl_page_with_progress(url, depth=crawl_depth)
        knowledge_data = process_text(crawled_text)
        data_for_training = [
            (item['source'], item['relationship'], item['target'])
            for item in knowledge_data
        ]
        df = pd.DataFrame(data_for_training, columns=['source', 'predicate', 'target'])
        # Write raw triples only; load_from_csv expects no header row
        df.to_csv('generated_knowledge_data.csv', index=False, header=False)
        logger.debug(f"Data saved to CSV: {df}")
        X = load_from_csv('.', 'generated_knowledge_data.csv', sep=',')
        kg_model = train_ampligraph(X)
        predictions = make_predictions(kg_model, X)
        visualize_data(knowledge_data, predictions)
        return render_template('result.html', url=url)
    return render_template('index.html')


if __name__ == '__main__':
    app.run(debug=True, port=8000)
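
# Example run (a sketch; assumes this file is saved as app.py and that the GGUF model file,
# the Graphistry credentials, and the index.html / result.html templates referenced above
# are all in place):
#   $ python app.py
#   # then submit the form at http://localhost:8000/, or POST the same fields directly:
#   $ curl -X POST -d "url=https://en.wikipedia.org/wiki/Knowledge_graph" -d "depth=1" http://localhost:8000/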