# code | Notion

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 17 16:50:32 2023

@author: juanc
"""

from http.client import HTTPException
from urllib import request
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Site root; relative links scraped from the listing are resolved against it.
BASE_URL = 'https://hackerspace.govhack.org'
# Page that holds the table of data sets.
LISTING_URL = BASE_URL + '/data_sets'
# Names of the two link columns appended after the table's own headers.
LINK_COLUMNS = ['url1', 'url2']
# Seconds to wait on any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30


def absolute_links(hrefs, limit=len(LINK_COLUMNS)):
    """Return exactly ``limit`` absolute URLs built from ``hrefs``.

    Relative links are resolved against ``BASE_URL``; links that are already
    absolute are kept unchanged. Surplus links are dropped and missing ones
    are filled with ``None`` so that every table row ends up the same width.
    """
    links = [urljoin(BASE_URL, href) for href in hrefs[:limit]]
    return links + [None] * (limit - len(links))


def build_dataframe(headers, rows):
    """Assemble the scraped table into a DataFrame.

    ``headers`` is the list of column titles taken from the table head and
    ``rows`` is an iterable of ``(cells, hrefs)`` pairs, one per table row.
    Cell lists are truncated or padded with ``None`` to ``len(headers)`` and
    the link columns named in ``LINK_COLUMNS`` are appended to each row.
    """
    width = len(headers)
    records = []
    for cells, hrefs in rows:
        cells = list(cells[:width])
        cells += [None] * (width - len(cells))
        records.append(cells + absolute_links(hrefs))
    # A flat list of names is required here: a nested list would produce
    # MultiIndex columns, which break the per-column lookups below.
    return pd.DataFrame(records, columns=list(headers) + LINK_COLUMNS)


def fetch_soup(url):
    """Download ``url`` and return the page parsed with BeautifulSoup.

    Raises ``requests.RequestException`` on connection problems, timeouts
    and HTTP error statuses.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def parse_listing(soup):
    """Extract ``(headers, rows)`` from the table with class 'projects-table'.

    ``rows`` is a list of ``(cells, hrefs)`` pairs: the stripped text of each
    ``<td>`` and the raw ``href`` of each link found in the row.

    Raises ``ValueError`` when the table or its header row is missing.
    """
    table = soup.find('table', {'class': 'projects-table'})
    if table is None:
        raise ValueError("no table with class 'projects-table' in the page")
    head = table.find('thead')
    header_row = head.find('tr') if head is not None else None
    if header_row is None:
        raise ValueError("table 'projects-table' has no header row")
    headers = [th.text.strip() for th in header_row.find_all('th')]

    rows = []
    for tr in table.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        if not cells:
            # Rows without <td> cells (the header row) carry no data.
            continue
        # href=True skips anchors that have no href attribute at all.
        hrefs = [a['href'] for a in tr.find_all('a', href=True)]
        rows.append((cells, hrefs))
    return headers, rows


def resolve_redirect(url):
    """Return the URL finally served for ``url`` after following redirects."""
    with request.urlopen(url, timeout=REQUEST_TIMEOUT) as response:
        return response.geturl()


def fetch_details(url):
    """Return ``(description, project)`` scraped from a data set page.

    The description is the text of the paragraph containing the
    'Description:' label, with the label removed; the project is the text of
    the first ``<h2>``. Either value is ``None`` when its element is absent.

    Raises ``requests.RequestException`` if the page cannot be downloaded.
    """
    soup = fetch_soup(url)

    description = None
    label = soup.find('strong', string='Description:')
    paragraph = label.find_parent('p') if label is not None else None
    if paragraph is not None:
        text = paragraph.get_text(strip=True)
        description = text.replace('Description:', '').strip()

    title = soup.find('h2')
    project = title.get_text(strip=True) if title is not None else None
    return description, project


def main():
    """Scrape the data set listing, enrich each row and export the result.

    Writes 'datos_extraidos.xlsx' and 'datos_extraidos.csv' ('|' separated)
    to the current directory and returns the exported DataFrame.
    """
    headers, rows = parse_listing(fetch_soup(LISTING_URL))
    df = build_dataframe(headers, rows)
    df['Project'] = None
    df['Description'] = None

    for index in df.index:
        # Replace the second link by the address it finally redirects to.
        # Best effort: on failure the unresolved link is kept.
        link = df.at[index, 'url2']
        if pd.notna(link):
            try:
                df.at[index, 'url2'] = resolve_redirect(link)
            except (OSError, ValueError, HTTPException) as exc:
                print(f'Could not resolve {link}: {exc}')

        # Visit the data set page for its description and project title.
        page = df.at[index, 'url1']
        if pd.isna(page):
            continue
        try:
            description, project = fetch_details(page)
        except requests.RequestException as exc:
            print(f'Could not read {page}: {exc}')
            continue
        df.at[index, 'Description'] = description
        df.at[index, 'Project'] = project

    # Keep only rows for which every field could be obtained.
    df = df.dropna()

    # Save the DataFrame as an Excel workbook and as a '|' separated file.
    df.to_excel('datos_extraidos.xlsx', index=False)
    df.to_csv('datos_extraidos.csv', sep='|', index=False)
    return df


if __name__ == '__main__':
    # Bound at module level so the result can be inspected after a run.
    df = main()