import streamlit as st
import pandas as pd
import requests
import uuid
from bs4 import BeautifulSoup
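
# Requires: streamlit, pandas, requests, beautifulsoup4
# Run with:  streamlit run <this_file>.py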
def extract_html_to_dataframe(url):
    """Fetch a URL and flatten its DOM into a DataFrame with one row per node."""
    try:
        # Fetch HTML content (timeout so a dead host can't hang the app)
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize data structures
        data = []
        processed_nodes = set()  # id()s of text nodes already extracted

        # Map nodes to unique IDs, keyed by id(node): bs4 hashes tags and
        # strings by content, so identical subtrees would otherwise share a UUID
        node_ids = {}

        def get_node_id(node):
            if id(node) not in node_ids:
                node_ids[id(node)] = str(uuid.uuid4())
            return node_ids[id(node)]

        # Recursive depth-first traversal of the DOM; note that very deep
        # documents can exceed Python's default recursion limit (~1000 frames)
        def traverse_node(node, depth=0, parent_id=None, sibling_index=0):
            if node is None:
                return

            current_id = get_node_id(node)
            has_children = hasattr(node, 'contents') and len(node.contents) > 0

            # Base properties for all nodes (tags and text nodes alike)
            node_data = {
                'node_id': current_id,
                'parent_id': parent_id,
                'depth': depth,
                'sibling_index': sibling_index,
                'tag_name': getattr(node, 'name', None) or 'text',
                'text': '',
                'text_length': 0,
                'has_children': has_children,
                'class': ' '.join(node.get('class', [])) if hasattr(node, 'get') else '',
                'id': node.get('id', '') if hasattr(node, 'get') else '',
                'href': node.get('href', '') if hasattr(node, 'get') else '',
                'html_length': len(str(node)),
            }

            # Extract text only from leaf nodes, so the same string is not
            # repeated at every ancestor level
            if not has_children and getattr(node, 'string', None) and id(node) not in processed_nodes:
                text = node.string.strip()
                if text:
                    node_data['text'] = text
                    node_data['text_length'] = len(text)
                    processed_nodes.add(id(node))

            # Keep tags, plus text nodes that carry non-whitespace content
            if getattr(node, 'name', None) is not None or node_data['text']:
                data.append(node_data)

            # Recurse into children (rows are appended in document order)
            if has_children:
                for i, child in enumerate(node.contents):
                    traverse_node(child, depth + 1, current_id, i)

        # Start traversal from the document root
        traverse_node(soup)

        # Convert to DataFrame
        return pd.DataFrame(data)

    except Exception as e:
        st.error(f"Error processing URL: {e}")
        return pd.DataFrame()
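
# Illustrative sketch (not wired into the UI): one way to consume the
# DataFrame above, joining the leaf text nodes back into a single string.
# Rows are appended in depth-first order, so row order is document order.
def extract_visible_text(df):
    """Concatenate all non-empty leaf text, in document order."""
    if df.empty:
        return ''
    return ' '.join(df.loc[df['text_length'] > 0, 'text'])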
# Streamlit app
st.title("HTML to DataFrame Converter")
st.write("Enter a URL to transform its HTML structure into a DataFrame")

url = st.text_input("Enter a URL:", "https://example.com")

if st.button("Convert"):
    with st.spinner("Processing HTML..."):
        df = extract_html_to_dataframe(url)

    if not df.empty:
        st.success(f"Successfully extracted {len(df)} nodes from HTML")

        # Show dataframe
        st.subheader("HTML Structure as DataFrame")
        st.dataframe(df)

        # Show some stats
        st.subheader("HTML Structure Statistics")
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Nodes", len(df))
        with col2:
            st.metric("Max Depth", int(df['depth'].max()))
        with col3:
            st.metric("Text Nodes", len(df[df['text_length'] > 0]))
        # Download option
        st.download_button(
            "Download DataFrame as CSV",
            df.to_csv(index=False).encode('utf-8'),
            "html_structure.csv",
            "text/csv",
            key='download-csv'
        )
    else:
        st.error("Failed to extract data from the URL")