What is the best way to scrape data from a website?
If you know Python, I'd recommend the beautiful soup, splinter, and pandas modules.
splinter automates loading and retrieving the web page (it drives a real browser, so it can handle pages that need to run JavaScript), then beautifulsoup can be used to parse the data, and finally a pandas dataframe writes the data out in csv or xls format.
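In outline the pipeline looks something like this - just a minimal sketch, not the actual script further down; the URL and the assumption that the page has a single table are made up for illustration:

    from splinter import Browser
    from bs4 import BeautifulSoup
    import pandas as pd

    # Visit the page with a real browser (defaults to Firefox) so any JavaScript runs.
    with Browser() as browser:
        browser.visit("http://example.com/some-table-page")  # placeholder URL
        html_source = browser.html

    # Parse the rendered HTML and pull the first table apart.
    soup = BeautifulSoup(html_source, 'html.parser')
    table = soup.find('table')
    header = [th.getText().strip() for th in table.findAll('th')]
    rows = [[td.getText().strip() for td in tr.findAll('td')]
            for tr in table.findAll('tr') if tr.findAll('td')]

    # Hand the rows to pandas and write them out.
    df = pd.DataFrame(rows, columns=header)
    df.to_csv("table.csv")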
I wrote some code to do this - it wasn't very difficult. You'll need Python installed, plus the pandas, beautifulsoup, and splinter modules, to use it. You also need the Firefox browser installed. Usage is (from the command line):
- python script.py datestart dateend
- python script.py 16-Sep-2014 20-Sep-2014
datestart and dateend are optional
It should also run by just double-clicking the script if you have your paths set up correctly.
There isn't any error handling or the other things you'd do in production code, but you get what you pay for :)
It runs a bit slowly: I've added long delays because the site has a slow response time, and it uses a real browser since the page has to execute JavaScript.
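If the fixed sleeps bother you, splinter's own wait_time polling (the script below already uses it once, in is_text_present) could in principle replace some of them. A rough, untested sketch, assuming the page keeps its id_base select element:

    from splinter import Browser

    with Browser() as browser:
        browser.visit("http://bristowgroup.com/clients/flight-status")
        # Poll for up to 10 seconds instead of an unconditional sleep(3);
        # returns as soon as the base-selection dropdown is present.
        if not browser.is_element_present_by_id('id_base', wait_time=10):
            raise RuntimeError("page did not load in time")
        # ...then continue with the same form-filling steps as in the script below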
    #!/usr/bin/python
    # -*- coding: cp1252 -*-
    from splinter import Browser
    from bs4 import BeautifulSoup
    from time import sleep
    from datetime import datetime, timedelta
    import pandas as pd
    import sys, os


    def get_header_and_columns_from_html(soup):
        # Pull the header cells and the data rows out of the first table on the page.
        table = soup.find('table')
        head = table.findAll('th')
        header = [h.getText().strip() for h in head]
        all_columns = []
        for row in table.findAll('tr'):
            col = row.findAll('td')
            columns = [c.getText().strip() for c in col]
            if columns:
                all_columns.append(columns)
        return header, all_columns


    def get_flight_table_for_city_on_date(citycode, date):
        # Drive a real browser through the form, then parse the result table
        # into a pandas DataFrame. Returns None if anything goes wrong.
        url = r"http://bristowgroup.com/clients/flight-status"
        with Browser() as browser:
            browser.visit(url)
            sleep(3)
            # Select the base (city) by clicking its <option> directly.
            city = '//select[@id="id_base"]/option[@value="{}"]'.format(citycode)
            browser.find_by_xpath(city)._element.click()
            sleep(3)
            browser.find_by_id('id_request_date').fill(date + "\t")
            sleep(3)
            browser.find_by_name('submit').click()
            if browser.is_text_present('Important Information', wait_time=7):
                html_source = browser.html
                soup = BeautifulSoup(html_source, 'html.parser')
                try:
                    header, all_columns = get_header_and_columns_from_html(soup)
                    return pd.DataFrame(data=all_columns, columns=header)
                except Exception:
                    return None
            else:
                return None


    def get_dates():
        # Build the list of dates to query from the command-line arguments:
        # no arguments -> today, one argument -> that date,
        # two arguments -> every date from the first to the second, inclusive.
        months = ['', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        now = datetime.now()
        date = "{}-{}-{}".format(now.day, months[now.month], now.year)
        if len(sys.argv) == 1:
            return [date]
        elif len(sys.argv) == 2:
            return [sys.argv[1]]
        elif len(sys.argv) == 3:
            dates = []
            d1 = datetime.strptime(sys.argv[1], "%d-%b-%Y").date()
            d2 = datetime.strptime(sys.argv[2], "%d-%b-%Y").date()
            delta = d2 - d1
            for i in range(delta.days + 1):
                d = d1 + timedelta(days=i)
                dates.append(d.strftime("%d-%b-%Y").lstrip('0'))
            return dates
        else:
            return None


    def create_folder(directory, foldername):
        # Create (if needed) and return a per-date output folder.
        if not directory:
            directory = os.path.curdir
        outdir = os.path.join(directory, foldername)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        return outdir


    if __name__ == '__main__':
        # To scrape only specific cities, comment out the ones you don't want.
        cities = {
            "20": "Bergen",
            "47": "Brønnøysund",
            "19": "Den Helder",
            "21": "Hammerfest",
            "30": "Humberside",
            "18": "Norwich",
            "2": "Scatsta",
            "13": "Sola",
        }
        dates = get_dates()
        # Replace with a string containing your preferred path; by default output
        # goes to the directory the script is run from.
        directory = r""
        for citycode in cities:
            for date in dates:
                outdir = create_folder(directory, date)
                flights_table = get_flight_table_for_city_on_date(citycode, date)
                if flights_table is not None:
                    out_path = os.path.join(outdir, "{}_{}.csv".format(date, cities[citycode]))
                    flights_table.to_csv(out_path)
                else:
                    # Leave a small log file so failed city/date combinations are visible.
                    out_path = os.path.join(outdir, "{}_{}_log.txt".format(date, cities[citycode]))
                    with open(out_path, 'w') as f:
                        f.write("failed {} {}".format(date, cities[citycode]))
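If you'd rather end up with a single spreadsheet than a folder full of CSVs, pandas can merge them afterwards. A rough sketch, not part of the script above - it assumes the per-date folder layout the script creates, a reasonably recent pandas, and an Excel engine such as openpyxl installed:

    import glob, os
    import pandas as pd

    date = "16-Sep-2014"  # one of the per-date folders created by the script
    with pd.ExcelWriter(os.path.join(date, "flights.xlsx")) as writer:
        for path in glob.glob(os.path.join(date, "*.csv")):
            # Excel sheet names are limited to 31 characters.
            name = os.path.splitext(os.path.basename(path))[0][:31]
            pd.read_csv(path, index_col=0).to_excel(writer, sheet_name=name)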