import streamlit as st
import pandas as pd
import requests
import uuid
from bs4 import BeautifulSoup
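
# Requires: streamlit, pandas, requests, beautifulsoup4
# Run with:  streamlit run <this_file>.py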
def extract_html_to_dataframe(url):
    """Fetch a URL and flatten its DOM into a DataFrame with one row per node."""
    try:
        # Fetch HTML content (timeout so a dead host can't hang the app)
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize data structures
        data = []
        processed_nodes = set()  # id()s of text nodes already extracted

        # Map nodes to unique IDs, keyed by id(node): bs4 hashes tags and
        # strings by content, so identical subtrees would otherwise share a UUID
        node_ids = {}

        def get_node_id(node):
            if id(node) not in node_ids:
                node_ids[id(node)] = str(uuid.uuid4())
            return node_ids[id(node)]

        # Recursive depth-first traversal of the DOM; note that very deep
        # documents can exceed Python's default recursion limit (~1000 frames)
        def traverse_node(node, depth=0, parent_id=None, sibling_index=0):
            if node is None:
                return

            current_id = get_node_id(node)
            has_children = hasattr(node, 'contents') and len(node.contents) > 0

            # Base properties for all nodes (tags and text nodes alike)
            node_data = {
                'node_id': current_id,
                'parent_id': parent_id,
                'depth': depth,
                'sibling_index': sibling_index,
                'tag_name': getattr(node, 'name', None) or 'text',
                'text': '',
                'text_length': 0,
                'has_children': has_children,
                'class': ' '.join(node.get('class', [])) if hasattr(node, 'get') else '',
                'id': node.get('id', '') if hasattr(node, 'get') else '',
                'href': node.get('href', '') if hasattr(node, 'get') else '',
                'html_length': len(str(node)),
            }

            # Extract text only from leaf nodes, so the same string is not
            # repeated at every ancestor level
            if not has_children and getattr(node, 'string', None) and id(node) not in processed_nodes:
                text = node.string.strip()
                if text:
                    node_data['text'] = text
                    node_data['text_length'] = len(text)
                    processed_nodes.add(id(node))

            # Keep tags, plus text nodes that carry non-whitespace content
            if getattr(node, 'name', None) is not None or node_data['text']:
                data.append(node_data)

            # Recurse into children (rows are appended in document order)
            if has_children:
                for i, child in enumerate(node.contents):
                    traverse_node(child, depth + 1, current_id, i)

        # Start traversal from the document root
        traverse_node(soup)

        # Convert to DataFrame
        return pd.DataFrame(data)

    except Exception as e:
        st.error(f"Error processing URL: {e}")
        return pd.DataFrame()
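
# Illustrative sketch (not wired into the UI): one way to consume the
# DataFrame above, joining the leaf text nodes back into a single string.
# Rows are appended in depth-first order, so row order is document order.
def extract_visible_text(df):
    """Concatenate all non-empty leaf text, in document order."""
    if df.empty:
        return ''
    return ' '.join(df.loc[df['text_length'] > 0, 'text'])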
# Streamlit app
st.title("HTML to DataFrame Converter")
st.write("Enter a URL to transform its HTML structure into a DataFrame")

url = st.text_input("Enter a URL:", "https://example.com")

if st.button("Convert"):
    with st.spinner("Processing HTML..."):
        df = extract_html_to_dataframe(url)

    if not df.empty:
        st.success(f"Successfully extracted {len(df)} nodes from HTML")

        # Show dataframe
        st.subheader("HTML Structure as DataFrame")
        st.dataframe(df)

        # Show some stats
        st.subheader("HTML Structure Statistics")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Nodes", len(df))
        with col2:
            st.metric("Max Depth", int(df['depth'].max()))
        with col3:
            st.metric("Text Nodes", len(df[df['text_length'] > 0]))
        # Download option
        st.download_button(
            "Download DataFrame as CSV",
            df.to_csv(index=False).encode('utf-8'),
            "html_structure.csv",
            "text/csv",
            key='download-csv'
        )
    else:
        st.error("Failed to extract data from the URL")