# Maker : L.T.
# Date : 2019.02.20.
# Description : Search NAVER News for specific keywords -> extract the press name,
#               article title, date, and URL
# import : requests, bs4
# How to in PyCharm : [File] - [Settings] - [Project: (projectname)] - [Project Interpreter] - "+"
#                     (from a terminal the same packages install with: pip install requests beautifulsoup4)

### Module imports
import requests
from bs4 import BeautifulSoup
from time import localtime, strftime
import time
from datetime import date

### Date setup
# date_today
date_today = date.today()
date_today_year = str(date_today.year)
date_today_month = str(date_today.month)
date_today_day = str(date_today.day)
date_today = str(date_today)

# date_yesterday
date_yesterday = date.fromtimestamp(time.time() - 60 * 60 * 24)
date_yesterday_year = str(date_yesterday.year)
date_yesterday_month = str(date_yesterday.month)
date_yesterday_day = str(date_yesterday.day)
date_yesterday = str(date_yesterday)

### URLs - NAVER News search results (RSS)
newsurl = [
    "http://newssearch.naver.com/search.naver?where=rss&query=%EC%A0%9C3%EC%9D%B8%ED%84%B0%EB%84%B7%EC%A0%84%EB%AC%B8%EC%9D%80%ED%96%89&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery=",
    "http://newssearch.naver.com/search.naver?where=rss&query=%EC%95%84%EB%A7%88%EC%A1%B4&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery=",
]

### Request header (user agent) configuration
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.5",
}

### Output file location and name
dir_path = ""
filetime = strftime("%Y-%m-%d_%H-%M-%S", localtime())
filename = "NEWS_" + filetime
savefile = open(dir_path + filename, mode="w", encoding="utf-8")
print("* * * * * * * * * * * * * * * * * ", filename, " * * * * * * * * * * * * * * * * *\n")

### Scraping
print("☆ NEWS")
savefile.write("☆ NEWS\n")
news_rep_arr = []   # lines already written, used to skip duplicates
news_rep_cnt = 0
newscnt = 0
try:
    while 0 <= newscnt < len(newsurl):
        # Send the request (URL + headers)
        r = requests.get(newsurl[newscnt], headers=headers)
        if r.status_code != 200:
            newscnt += 1   # skip this feed instead of retrying it forever
            continue
        r.encoding = "utf-8"

        if newscnt == 0:
            news_title = " [NAVER news - 제3 인터넷 전문 은행]"
        elif newscnt == 1:
            news_title = " [NAVER news - 아마존]"
        else:
            break
        print(news_title, end="")
        savefile.write("\n" + news_title + "\n")

        # Parse the RSS response
        soup = BeautifulSoup(r.text, "html.parser")

        ### Extract the press name, article title, date, and URL
        # Split out the <item> tags
        contenthtml = soup.find_all("item")
        incnt = 0
        totalcnt = 0
        while incnt < len(contenthtml):
            # html.parser treats <link> as a void tag, so the article URL survives only
            # as loose text after <link/>. Rewrite it into an <a>...</a> element so the
            # URL can be read back with get_text() below.
            contentstr = str(contenthtml[incnt])
            contentstr = contentstr.replace("link/", "a")
            contentstr = contentstr.replace(" <description>", "</a> <description>")
            contentstr = BeautifulSoup(contentstr, "html.parser")
            incnt += 1

            ### Extract the press name
            authorstr = contentstr.find_all("author")
            author = authorstr[0].get_text()
            author = "[" + author + "]"

            ### Extract the article title
            titlestr = contentstr.find_all("title")
            title = titlestr[0].get_text()
            title = title.replace("&quot;", "\"")
            title = title.replace("&apos;", "'")

            ### Extract the date and reformat it
            # (note: this rebinds the name `date`; the datetime class is no longer needed here)
            datestr = contentstr.find_all("pubdate")
            date = datestr[0].get_text()
            date = date.replace(",", "")
            date = date.replace("Jan", "01")
            date = date.replace("Feb", "02")
            date = date.replace("Mar", "03")
            date = date.replace("Apr", "04")
            date = date.replace("May", "05")
            date = date.replace("Jun", "06")
            date = date.replace("Jul", "07")
            date = date.replace("Aug", "08")
            date = date.replace("Sep", "09")
            date = date.replace("Oct", "10")
            date = date.replace("Nov", "11")
            date = date.replace("Dec", "12")
            # e.g. "Wed 20 02 2019 10:30:00 +0900" -> "2019-02-20 10:30:00"
            date_split = date.split(" ")
            date_temp = date_split[3], date_split[2], date_split[1]
            date = "-".join(date_temp)
            date = date + " " + date_split[4]
            # Keep only articles dated today or yesterday
            if not (date_today in date) and not (date_yesterday in date) and (date != ""):
                continue

            ### Extract the article URL
            urlstr = contentstr.find_all("a")
            url = urlstr[0].get_text()

            ### Reassemble the extracted data
            temp = " ", author, title, "\t|", date, "\n\t\t\t\t", url
            output = " ".join(temp)

            ### Write the extracted data to the file, skipping duplicates
            for i in range(0, news_rep_cnt + 1):
                if news_rep_cnt == 0:
                    news_rep_arr.append(output)
                    savefile.write(output + "\n")
                    news_rep_cnt += 1
                    totalcnt += 1
                elif news_rep_arr[i] == output:
                    break
                elif (news_rep_arr[i] != output) and (i + 1 == news_rep_cnt):
                    news_rep_arr.append(output)
                    savefile.write(output + "\n")
                    news_rep_cnt += 1
                    totalcnt += 1
                    break

        print(" : ", totalcnt)
        newscnt += 1
except Exception as e:
    print("ERROR:", e)

print("\n**************************************************************************************************")
savefile.close()