# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
headers = {
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
"accept-encoding": "gzip,deflate,sdch",
"accept-language": "tr,tr-TR,en-US,en;q=0.8",
}
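
# The headers above imitate a desktop browser so the site serves its normal
# HTML. A sketch of making the same requests through a shared Session (an
# addition, not part of the original script) that reuses one connection for
# the front page and every article:
#
#     session = requests.Session()
#     session.headers.update(headers)
#     r = session.get("http://www.hurriyet.com.tr")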

def haber_oku(haber_url):
    """Fetch a single article page and return its body text, or None on failure."""
    r = requests.get(haber_url, headers=headers)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.content, "html.parser")
    # Prefer the structured article body; fall back to the shorter description.
    result = soup.find("div", {"itemprop": "articleBody"})
    if result is None:
        result = soup.find("div", {"itemprop": "description"})
    if result is not None:
        return result.get_text()
    return None
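
# Minimal usage sketch for haber_oku; the URL below is a made-up placeholder,
# not a real article:
#
#     text = haber_oku("http://www.hurriyet.com.tr/ornek-haber.asp")
#     if text is not None:
#         print(text[:200])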

def scrape_hurriyet(keywords, detay_goster):
    url = "http://www.hurriyet.com.tr"
    keywords = keywords.split(',')
    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print("Request was rejected")
        return
    soup = BeautifulSoup(r.content, "html.parser")  # this part is important
    results = soup.find_all("a")
    print("Total number of links:", len(results))
    liste_link = []
    liste_text = []
    for result in results:
        h = result.get('href')
        t = result.get_text()
        # Keep only on-site article links (".asp" pages), without duplicates.
        if h is not None and 'http://www.hurriyet.com.tr/' in h:
            if h not in liste_link and h.find('.asp') > 0:
                liste_link.append(h)
                liste_text.append(t)
    print("Unique links:", len(liste_link))
    # Scan each collected article for the requested keywords. haber_oku
    # already returns plain text, so no second BeautifulSoup pass is needed.
    for h, t in zip(liste_link, liste_text):
        haber = haber_oku(h)
        if haber is None:
            continue
        found = ""
        for keyword in keywords:
            if haber.find(keyword) >= 0:
                found = found + " " + keyword
        if found:
            print(h, t, found)
            if detay_goster:
                print(haber)
            print('----------------------')
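
# Note that haber.find(keyword) is case sensitive. A sketch of a
# case-insensitive match (an adaptation, not in the original script):
#
#     if haber.lower().find(keyword.lower()) >= 0:
#         found = found + " " + keyword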

if __name__ == "__main__":
    keywords = 'deniz,liman,vapur,kaptan,marina'  # sea, port, ferry, captain, marina
    scrape_hurriyet(keywords, True)
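
# The same scrape with full article bodies suppressed (only matching links
# and the keywords that hit are printed):
#
#     scrape_hurriyet('deniz,liman,vapur,kaptan,marina', False)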