Main initial commit
commit 7d10b7bafa
app.py (new file, 283 lines)
@@ -0,0 +1,283 @@
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urldefrag
import wikipediaapi
from flask import Flask, request, render_template
from llama_cpp import Llama
from ampligraph.latent_features import ScoringBasedEmbeddingModel
from ampligraph.evaluation import mrr_score, hits_at_n_score
from ampligraph.latent_features.loss_functions import get as get_loss
from ampligraph.latent_features.regularizers import get as get_regularizer
from ampligraph.datasets import load_from_csv
import graphistry
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
from cachetools import cached, TTLCache
from tqdm import tqdm
import fitz  # PyMuPDF
import os

# Configuration
WIKIPEDIA_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
LLM_MODEL_PATH = "./dolphin-2_6-phi-2.Q4_K_M.gguf"
GRAPHISTRY_API = {
    "api": 3,
    "protocol": "https",
    "server": "hub.graphistry.com",
    "username": "mkommar",
    "password": "pyrotech"
}
CACHE_TTL = 3600  # Cache TTL in seconds

# Configure logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
logger.debug = print

# Force TensorFlow to use the CPU
tf.config.set_visible_devices([], 'GPU')

# Initialize Flask app
app = Flask(__name__)

# Initialize Wikipedia API
wiki_wiki = wikipediaapi.Wikipedia(WIKIPEDIA_AGENT)

# Initialize llama_cpp model
model = Llama(
    model_path=LLM_MODEL_PATH,
    n_ctx=2048,
    n_threads=8,
    n_gpu_layers=35
)

# Register PyGraphistry
graphistry.register(**GRAPHISTRY_API)

# Cache
wiki_cache = TTLCache(maxsize=100, ttl=CACHE_TTL)

@cached(wiki_cache)
def fetch_wikipedia_data(query):
    logger.debug(f"Fetching Wikipedia data for query: {query}")
    page = wiki_wiki.page(query)
    if page.exists():
        logger.debug("Page found")
        return page.text
    logger.debug("Page not found")
    return ""

def extract_text_from_pdf(pdf_url):
    logger.debug(f"Extracting text from PDF: {pdf_url}")
    response = requests.get(pdf_url)
    response.raise_for_status()
    with open('temp.pdf', 'wb') as f:
        f.write(response.content)

    pdf_text = ""
    with fitz.open('temp.pdf') as doc:
        for page in doc:
            pdf_text += page.get_text()

    os.remove('temp.pdf')
    return pdf_text

def process_text(text):
    logger.debug("Processing text with Llama_cpp")
    chunk_size = 512
    overlap = 50
    knowledge_graph_elements = []

    def process_chunk(chunk):
        format_template = """{ source: "Term1", target: "Term2", relationship: "relation" },
{ source: "Term3", target: "Term4", relationship: "relation" },
{ source: "Term5", target: "Term6", relationship: "relation" },
{ source: "Term7", target: "Term8", relationship: "relation" },
{ source: "Term9", target: "Term10", relationship: "relation" }"""
        query = f"Extract knowledge graph elements in the format: {format_template}. Use the actual terms from the text instead of placeholders like 'ConceptA' or 'ConceptB'.\n\nText: {chunk}"
        logger.debug(f"Sending query to LLM: {query}")
        response = model(f"User: {query}\nAI:", max_tokens=chunk_size, stop=["User:", "AI:"])['choices'][0]['text'].strip()
        logger.debug(f"LLM response: {response}")
        return extract_relationships(response)

    def extract_relationships(response):
        logger.debug("Extracting relationships from LLM response")
        elements = []
        lines = response.split('\n')
        for line in lines:
            if 'source' in line and 'target' in line and 'relationship' in line:
                try:
                    parts = line.replace('{', '').replace('}', '').replace('"', '').strip().split(',')
                    element = {
                        'source': parts[0].split(':')[-1].strip(),
                        'target': parts[1].split(':')[-1].strip(),
                        'relationship': parts[2].split(':')[-1].strip()
                    }
                    elements.append(element)
                except IndexError as e:
                    logger.error(f"Error parsing line: {line} - {e}")
                    continue
        logger.debug(f"Extracted elements: {elements}")
        return elements

    # Slide a fixed-size window over the text with a small overlap between chunks
    start = 0
    total_chunks = (len(text) - overlap) // (chunk_size - overlap)
    for _ in tqdm(range(total_chunks), desc="Processing Text"):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        knowledge_graph_elements.extend(process_chunk(chunk))
        start = end - overlap

    logger.debug(f"Final knowledge graph elements: {knowledge_graph_elements}")
    return knowledge_graph_elements

def perform_knowledge_fusion(processed_text):
    logger.debug("Performing knowledge fusion")
    return processed_text

def train_ampligraph(X):
    logger.debug("Training AmpliGraph model")
    model = ScoringBasedEmbeddingModel(k=150,
                                       eta=10,
                                       scoring_type='ComplEx')

    optim = tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss = get_loss('pairwise', {'margin': 0.5})
    regularizer = get_regularizer('LP', {'p': 2, 'lambda': 1e-5})

    model.compile(optimizer=optim, loss=loss, entity_relation_regularizer=regularizer)

    model.fit(X)
    logger.debug("Model training completed")
    return model

def make_predictions(model, triples):
    logger.debug("Making predictions with AmpliGraph model")
    scores = model.predict(triples)
    predictions = np.array(scores) > 0
    logger.debug(f"Predictions: {predictions}")
    return predictions

def visualize_data(data, predictions=None):
    logger.debug("Visualizing data with PyGraphistry")
    edges = []
    for i, item in enumerate(data):
        edge = {
            'source': item['source'],
            'target': item['target'],
            'relationship': item['relationship']
        }
        if predictions is not None and i < len(predictions):
            if predictions[i]:
                edge['color'] = 'green'
            else:
                edge['color'] = 'red'
        else:
            edge['color'] = 'blue'
        edges.append(edge)

    df = pd.DataFrame(edges)
    logger.debug(f"DataFrame for visualization: {df}")
    g = graphistry.edges(df, 'source', 'target').bind(edge_title='relationship', edge_color='color')
    g.plot()

def generate_knowledge_graph(query):
    logger.debug(f"Generating knowledge graph for query: {query}")
    text = fetch_wikipedia_data(query)
    processed_text = process_text(text)
    knowledge_data = perform_knowledge_fusion(processed_text)
    logger.debug(f"Generated knowledge data: {knowledge_data}")
    return knowledge_data

def suggest_pages(query):
    logger.debug(f"Suggesting pages for query: {query}")
    response = model(f"User: Suggest related Wikipedia pages for: {query}\nAI:", max_tokens=150, stop=["User:", "AI:"])['choices'][0]['text'].strip()
    logger.debug(f"Suggested pages: {response}")
    return response

def crawl_page_with_progress(url, depth=3):
    logger.debug(f"Crawling page: {url} at depth: {depth}")
    visited = set()
    to_visit = [(url, 0)]
    all_texts = []
    added_urls = set()

    total_to_visit = depth * 10  # Estimate, adjust based on expected average links per page
    progress_bar = tqdm(total=total_to_visit, desc="Crawling Pages")

    while to_visit:
        current_url, current_depth = to_visit.pop(0)
        if current_depth > depth or current_url in visited:
            continue

        start_time = time.time()

        try:
            response = requests.get(current_url)
            response.raise_for_status()
            visited.add(current_url)
            end_time = time.time()
            download_time = end_time - start_time
            download_speed = len(response.content) / download_time / 1024  # KB/s

            soup = None  # only set for HTML pages; PDFs yield no links to follow
            if current_url.endswith('.pdf'):
                page_text = extract_text_from_pdf(current_url)
            else:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.get_text()

            all_texts.append(page_text)

            logger.debug(f"Visited {current_url} - Download Speed: {download_speed:.2f} KB/s")

            if current_depth < depth and soup is not None:
                links_found = 0
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    full_url = urljoin(current_url, href)
                    full_url, _ = urldefrag(full_url)  # Remove the fragment part
                    # Stay within the same domain and avoid self-references
                    if urlparse(full_url).netloc == urlparse(url).netloc and full_url != current_url:
                        if full_url not in added_urls:
                            to_visit.append((full_url, current_depth + 1))
                            added_urls.add(full_url)
                            links_found += 1
                progress_bar.update(links_found)

        except requests.RequestException as e:
            logger.error(f"Error crawling {current_url}: {e}")
            progress_bar.update(1)

    progress_bar.close()
    combined_text = ' '.join(all_texts)
    return combined_text

@app.route('/', methods=['GET', 'POST'])
def home():
    if request.method == 'POST':
        url = request.form['url']
        crawl_depth = int(request.form.get('depth', 3))
        logger.debug(f"Received URL: {url} with crawl depth: {crawl_depth}")

        crawled_text = crawl_page_with_progress(url, depth=crawl_depth)
        knowledge_data = process_text(crawled_text)

        data_for_training = [(item['source'], item['relationship'], item['target']) for item in knowledge_data]
        df = pd.DataFrame(data_for_training, columns=['source', 'predicate', 'target'])
        df.to_csv('generated_knowledge_data.csv', index=False)
        logger.debug(f"Data saved to CSV: {df}")
        X = load_from_csv('.', 'generated_knowledge_data.csv', sep=',')
        model = train_ampligraph(X)

        predictions = make_predictions(model, X)

        visualize_data(knowledge_data, predictions)

        return render_template('result.html', url=url)
    return render_template('index.html')

if __name__ == '__main__':
    app.run(debug=True, port=8000)
application.log (new file, 250 lines)
File diff suppressed because one or more lines are too long
generated_knowledge_data.csv (new file, 29227 lines)
File diff suppressed because it is too large
requirements.txt (new file, 12 lines)
@@ -0,0 +1,12 @@
llama-cpp-python
ampligraph==2.1.0
tensorflow-macos==2.9.0
tensorflow-metal==0.6
graphistry
graphistry[bolt,gremlin,nodexl,igraph,networkx]
wikipedia-api
flask
pandas
bs4
tqdm
pymupdf
static/graph.js (new file, 101 lines)
@@ -0,0 +1,101 @@
// graph.js

document.addEventListener('DOMContentLoaded', function() {
    const data = {
        nodes: [
            { id: "Force", label: "Force" },
            { id: "Mass", label: "Mass" },
            { id: "Acceleration", label: "Acceleration" },
            { id: "Gravity", label: "Gravity" },
            { id: "Friction", label: "Friction" },
            { id: "Motion", label: "Motion" }
        ],
        links: [
            { source: "Force", target: "Mass", relationship: "is_applied_on" },
            { source: "Force", target: "Acceleration", relationship: "causes" },
            { source: "Gravity", target: "Mass", relationship: "acts_on" },
            { source: "Friction", target: "Motion", relationship: "opposes" },
            { source: "Force", target: "Friction", relationship: "overcomes" }
        ]
    };

    const width = 800;
    const height = 600;

    const svg = d3.select("#graph").append("svg")
        .attr("width", width)
        .attr("height", height);

    const simulation = d3.forceSimulation(data.nodes)
        .force("link", d3.forceLink(data.links).id(d => d.id).distance(200))
        .force("charge", d3.forceManyBody().strength(-500))
        .force("center", d3.forceCenter(width / 2, height / 2));

    const link = svg.append("g")
        .attr("stroke", "#999")
        .attr("stroke-opacity", 0.6)
        .selectAll("line")
        .data(data.links)
        .enter().append("line")
        .attr("stroke-width", d => Math.sqrt(d.value || 1)); // links carry no "value" field, so fall back to width 1

    const node = svg.append("g")
        .attr("stroke", "#fff")
        .attr("stroke-width", 1.5)
        .selectAll("circle")
        .data(data.nodes)
        .enter().append("circle")
        .attr("r", 10)
        .attr("fill", "red")
        .call(d3.drag()
            .on("start", dragstarted)
            .on("drag", dragged)
            .on("end", dragended));

    node.append("title")
        .text(d => d.id);

    // Add labels
    const labels = svg.append("g")
        .selectAll("text")
        .data(data.nodes)
        .enter().append("text")
        .attr("x", d => d.x)
        .attr("y", d => d.y)
        .attr("dy", -3)
        .attr("text-anchor", "middle")
        .text(d => d.label);

    simulation.on("tick", () => {
        link
            .attr("x1", d => d.source.x)
            .attr("y1", d => d.source.y)
            .attr("x2", d => d.target.x)
            .attr("y2", d => d.target.y);

        node
            .attr("cx", d => d.x)
            .attr("cy", d => d.y);

        labels
            .attr("x", d => d.x)
            .attr("y", d => d.y);
    });

    function dragstarted(event, d) {
        if (!event.active) simulation.alphaTarget(0.3).restart();
        d.fx = d.x;
        d.fy = d.y;
    }

    function dragged(event, d) {
        d.fx = event.x;
        d.fy = event.y;
    }

    function dragended(event, d) {
        if (!event.active) simulation.alphaTarget(0);
        d.fx = null;
        d.fy = null;
    }
});
templates/index.html (new file, 13 lines)
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html>
<head>
    <title>Knowledge Fusion App</title>
</head>
<body>
    <form method="POST">
        <input type="text" name="url" placeholder="Enter the URL">
        <input type="number" name="depth" placeholder="Crawl depth" value="3" min="1" max="5">
        <input type="submit" value="Search">
    </form>
</body>
</html>
templates/result.html (new file, 12 lines)
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
    <title>Knowledge Fusion Result</title>
</head>
<body>
    <h1>Results for "{{ url }}"</h1>
    <div id="graph"></div>
    <script src="https://d3js.org/d3.v7.min.js"></script>
    <script src="/static/graph.js"></script>
</body>
</html>
your_dataset.csv (new file, 6 lines)
@@ -0,0 +1,6 @@
subject,predicate,object
Force,is_applied_on,Mass
Force,causes,Acceleration
Gravity,acts_on,Mass
Friction,opposes,Motion
Force,overcomes,Friction