Como recuperar automaticamente os URLs das chamadas AJAX?
O objetivo é programar um rastreador capaz de:
1) Recuperar os URLs dos links que estão na tabela desta página: http://cordis.europa.eu/fp7/security/projects_en.html
2) Seguir a chamada AJAX de cada um desses URLs para descobrir os URLs finais ("AJAX") que contêm os dados que eu quero raspar
3) Raspar as páginas finais identificadas pelos URLs AJAX.
Até agora, escrevi duas aranhas (spiders) com o Scrapy:
1) A primeira recupera os URLs dos links na página inicial. Aqui está o código:
from scrapy.spider import Spider
from scrapy.selector import HtmlXPathSelector
from cordis.items import CordisItem
class MySpider(Spider):
    """Collect the project links listed on the CORDIS FP7 security page."""

    name = "Cordis1"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = ["http://cordis.europa.eu/fp7/security/projects_en.html"]

    def parse(self, response):
        """Return a single ``CordisItem`` holding every project link.

        Args:
            response: the downloaded start page.

        Returns:
            A ``CordisItem`` whose ``"link"`` field is the list of ``href``
            values matched by ``//ul/li/span/a/@href`` (empty if none match).
        """
        hxs = HtmlXPathSelector(response)
        item = CordisItem()
        # The expression starts with "//", so it is evaluated against the
        # whole document no matter which node it is called on. Querying once
        # from the root therefore yields the same list as querying from each
        # <p>, and still produces an item when the page has no <p> at all.
        item["link"] = hxs.select("//ul/li/span/a/@href").extract()
        return item
2) A segunda raspa os dados dos URLs "AJAX". Aqui está o código:
from scrapy.spider import Spider
from scrapy.selector import Selector
class EssaiSpider(Spider):
    """Scrape project details from CORDIS project-detail ("AJAX") pages."""

    name = "aze"
    allowed_domains = ["cordis.europa.eu"]
    start_urls = [
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=95607",
        "http://cordis.europa.eu/projects/index.cfm?fuseaction=app.csa&action=read&xslt-template=projects/xsl/projectdet_en.xslt&rcn=93528",
    ]

    # Participant blocks are looked up by element id: part1 .. part40.
    _MAX_PARTICIPANTS = 40
    # Participants whose name and country appear in the printed summary.
    _PRINTED_PARTICIPANTS = (1, 2, 5, 10, 20, 30, 40)

    def parse(self, response):
        """Extract the project fields from one detail page and print them.

        Every field is the list of text nodes matched by its XPath, so a
        field that is absent from the page is an empty list rather than an
        error.

        Args:
            response: the downloaded project-detail page.

        Returns:
            None. One space-separated summary line is written to stdout.
        """
        sel = Selector(response)

        def texts(xpath):
            """Return the list of strings matched by *xpath* on the page."""
            return sel.xpath(xpath).extract()

        acronym = texts("//*[@class='projttl']/h1/text()")
        short_desc = texts("//*[@class='projttl']/h2/text()")
        start = texts("//*[@class='projdates']/b[1]/following::text()[1]")
        end = texts("//*[@class='projdates']/b[2]/following::text()[1]")
        long_desc = texts("//*[@class='tech']/p/text()")
        cost = texts("//*[@class='box-left']/b[3]/following::text()[1]")
        contrib = texts("//*[@class='box-left']/b[4]/following::text()[1]")
        # Named project_type so the builtin ``type`` is not shadowed.
        project_type = texts("//*[@class='box-right']/p[3]/br/following::text()[1]")
        sujet = texts("//*[@id='subjects']/h2/following::text()[1]")
        coord = texts("//*[@class='projcoord']/div[1]/div[1]/text()")
        coord_nat = texts("//*[@class='projcoord']/div[1]/div[2]/text()")

        # Map participant index -> (name texts, country texts). The XPaths
        # are absolute, so they are queried once from the document root;
        # generating them from the index keeps each id in step with its slot.
        participants = {}
        for index in range(1, self._MAX_PARTICIPANTS + 1):
            base = "//*[@id='part%d']/div[1]" % index
            participants[index] = (
                texts(base + "/div[1]/text()"),
                texts(base + "/div[2]/text()"),
            )

        values = [
            acronym, short_desc, start, end, long_desc, cost, contrib,
            project_type, sujet, coord, coord_nat,
        ]
        for index in self._PRINTED_PARTICIPANTS:
            values.extend(participants[index])

        # Joining the str() of each value gives the same text as a Python 2
        # ``print a, b, ...`` statement while remaining valid on Python 3.
        print(" ".join(str(value) for value in values))
Eu poderia recuperar manualmente o que, na falta de um termo melhor, chamei de URLs “AJAX”, filtrando as requisições XHR com o Netbug para cada um dos URLs produzidos pela primeira aranha. Depois, eu teria que fornecer esses URLs “AJAX” à segunda aranha.
Mas é possível recuperar automaticamente esses URLs "AJAX"?
De maneira mais geral, como escrever uma única aranha de rastreamento que execute todas as três operações descritas acima?