Python Requests ve BeautifulSoup paketleri
[youtube https://www.youtube.com/watch?v=r7__TtkGKbE&w=560&h=315]
Bu kez kodların açıklamalarına girmiyorum. Videoda yeterince ayrıntı mevcut. Kullandığım kodlar da aşağıda mevcut.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
# -*- coding: utf-8 -*-
"""Scan the hurriyet.com.tr front page for articles that mention given keywords.

The front page is fetched, every link that points at an ``.asp`` page on the
same site is collected once, each linked article is downloaded, and the
articles whose text contains at least one keyword are printed.
"""
import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every request.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "accept-charset": "cp1254,ISO-8859-9,utf-8;q=0.7,*;q=0.3",
    "accept-encoding": "gzip,deflate,sdch",
    "accept-language": "tr,tr-TR,en-US,en;q=0.8",
}

# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the whole scrape indefinitely.
_TIMEOUT_SECONDS = 10

# Naming the parser explicitly keeps results identical across machines and
# avoids BeautifulSoup's "no parser was explicitly specified" warning.
_PARSER = "html.parser"


def _sayfa_getir(url):
    """Download *url* and return it as a parsed BeautifulSoup document.

    Returns None when the request fails (network error, timeout) or the
    server answers with any status code other than 200.
    """
    try:
        r = requests.get(url, headers=headers, timeout=_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None
    # The original flags this step as important: the raw bytes (r.content)
    # are handed over so that BeautifulSoup detects the page encoding itself.
    return BeautifulSoup(r.content, _PARSER)


def haber_oku(haber_url):
    """Return the plain text of the news article at *haber_url*.

    The text is taken from the first ``<div>`` whose ``itemprop`` attribute
    is ``articleBody``; if there is none, the ``description`` div is used.
    Returns None when the page cannot be fetched or has neither div.
    """
    soup = _sayfa_getir(haber_url)
    if soup is None:
        return None
    for itemprop in ("articleBody", "description"):
        result = soup.find("div", {"itemprop": itemprop})
        if result is not None:
            return result.get_text()
    return None


def scrape_hurriyet(keywords, detay_goster):
    """Print the front-page articles of hurriyet.com.tr that mention a keyword.

    Args:
        keywords: Comma-separated keywords. Surrounding whitespace is ignored
            and empty entries are dropped (an empty keyword would otherwise
            match every article).
        detay_goster: When truthy, the full article text and a separator line
            are printed after each matching link.

    Returns:
        None. All results are written to standard output.
    """
    url = "http://www.hurriyet.com.tr"
    keywords = [k.strip() for k in keywords.split(",") if k.strip()]

    soup = _sayfa_getir(url)
    if soup is None:
        print("request reddedildi")
        return

    results = soup.find_all("a")
    print("Toplam link sayısı : ", len(results))

    # Collect each on-site .asp link once, together with its anchor text.
    haberler = []
    gorulen = set()
    for result in results:
        h = result.get("href")
        if h is None or h in gorulen:
            continue
        if "http://www.hurriyet.com.tr/" in h and ".asp" in h:
            gorulen.add(h)
            haberler.append((h, result.get_text()))
    print("Tekil linkler: ", len(haberler))

    for h, t in haberler:
        haber = haber_oku(h)
        if haber is None:
            continue
        eslesen = [keyword for keyword in keywords if keyword in haber]
        if not eslesen:
            continue
        # Same output shape as before: each hit is prefixed with one space.
        found = "".join(" " + keyword for keyword in eslesen)
        print(h, t, found)
        if detay_goster:
            print(haber)
            print('----------------------')


keywords = 'deniz,liman,vapur,kaptan,marina'
scrape_hurriyet(keywords, True)
Beni izlemeye devam edin.
ahmet aksoy