# Python (2018)

#004. 스크래핑

# Maker : L.T.
# Date : 2019.02.20.
# Description : 네이버 뉴스에서 특정 Keyword 검색 → 언론사, 기사 제목, 날짜, URL 추출하기
# import : requests, bs4
#          How to in PyCharm : [File] - [Settings] - [Project: (projectname)] - [Project Interpreter] - "+"

### 모듈 import
import requests
from bs4 import BeautifulSoup
from time import localtime, strftime
import time
from datetime import date


### 날짜 설정 — today's and yesterday's dates, as strings, for filtering articles

# Today: keep the ISO string plus separate year/month/day strings.
_today = date.today()
date_today_year = str(_today.year)
date_today_month = str(_today.month)
date_today_day = str(_today.day)
date_today = str(_today)

# Yesterday: current epoch time minus 24 hours, converted to a local date.
_yesterday = date.fromtimestamp(time.time() - 60 * 60 * 24)
date_yesterday_year = str(_yesterday.year)
date_yesterday_month = str(_yesterday.month)
date_yesterday_day = str(_yesterday.day)
date_yesterday = str(_yesterday)


### URL - 네이버 뉴스 검색 결과 (RSS)
# Naver news RSS search endpoint; {} receives the percent-encoded search keyword.
_RSS_SEARCH = (
    "http://newssearch.naver.com/search.naver?where=rss&query={}"
    "&field=0&nx_search_query=&nx_and_query=&nx_sub_query=&nx_search_hlquery="
)
newsurl = [
    _RSS_SEARCH.format("%EC%A0%9C3%EC%9D%B8%ED%84%B0%EB%84%B7%EC%A0%84%EB%AC%B8%EC%9D%80%ED%96%89"),  # "제3인터넷전문은행"
    _RSS_SEARCH.format("%EC%95%84%EB%A7%88%EC%A1%B4"),  # "아마존"
]


### Agent 환경 설정
# Browser-like request headers sent with every RSS fetch.
headers = dict([
    ("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"),
    ("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    ("accept-charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.3"),
    ("accept-encoding", "gzip, deflate, br"),
    ("accept-language", "en-US,en;q=0.5"),
])


### 출력 파일의 위치, 이름 설정
dir_path = ""  # output directory ("" = current working directory)
# Timestamped output file name, e.g. "NEWS_2019-02-20_10-30-00".
filetime = strftime("%Y-%m-%d_%H-%M-%S", localtime())
filename = f"NEWS_{filetime}"
savefile = open(dir_path + filename, encoding="utf-8", mode="w")
print("* * * * * * * * * * * * * * * * * ", filename, " * * * * * * * * * * * * * * * * *\n")


### 스크랩 — fetch each RSS feed, keep only today's/yesterday's articles,
### dedupe, and write "[publisher] title | date / url" lines to the file.
print("☆ NEWS")
savefile.write("☆ NEWS\n")

# Section headings, parallel to `newsurl` by index.
news_titles = [
    "  [NAVER news - 제3 인터넷 전문 은행]",
    "  [NAVER news - 아마존]",
]

# English month abbreviations (RFC-822 pubDate) → zero-padded month numbers.
MONTHS = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}

news_rep_arr = []  # output lines already written (dedup across feeds)
try:
    for newscnt, feed_url in enumerate(newsurl):
        if newscnt >= len(news_titles):
            break

        # Fetch the feed; on a non-200 response skip to the next feed.
        # BUG FIX: the original `continue` re-entered the while loop without
        # incrementing the counter, retrying the same URL forever.
        r = requests.get(feed_url, headers=headers)
        if r.status_code != 200:
            continue
        r.encoding = "utf-8"

        news_title = news_titles[newscnt]
        print(news_title, end="")
        savefile.write("\n" + news_title + "\n")

        # Parse the feed and pull out each <item>.
        soup = BeautifulSoup(r.text, "html.parser")
        contenthtml = soup.find_all("item")

        totalcnt = 0
        for item in contenthtml:
            # html.parser collapses <link> to a self-closing tag; rewrite it
            # to an <a>...</a> pair so the URL text survives re-parsing.
            contentstr = str(item)
            contentstr = contentstr.replace("link/", "a")
            contentstr = contentstr.replace(" <description>", "</a> <description>")
            contentstr = BeautifulSoup(contentstr, "html.parser")

            ### 언론사 추출 (publisher)
            author = "[" + contentstr.find_all("author")[0].get_text() + "]"

            ### 기사 제목 추출 (title; unescape residual HTML entities)
            title = contentstr.find_all("title")[0].get_text()
            title = title.replace("&quot;", "\"").replace("&apos;", "\'")

            ### 날짜 추출 및 형식 변경 (RFC-822 pubDate → "YYYY-MM-DD HH:MM:SS")
            # Renamed from `date` so the `datetime.date` import is not shadowed.
            pub_date = contentstr.find_all("pubdate")[0].get_text()
            pub_date = pub_date.replace(",", "")
            for mon, num in MONTHS.items():
                pub_date = pub_date.replace(mon, num)

            # After the replacements: "Wed 20 02 2019 10:00:00 +0900"
            parts = pub_date.split(" ")  # [weekday, day, month, year, time, tz]
            pub_date = "-".join((parts[3], parts[2], parts[1])) + " " + parts[4]

            # Keep only articles dated today or yesterday.
            if date_today not in pub_date and date_yesterday not in pub_date and pub_date != "":
                continue

            ### URL 추출
            url = contentstr.find_all("a")[0].get_text()

            ### 추출한 데이터 재가공 (one output record per article)
            output = " ".join(("   ", author, title, "\t|", pub_date, "\n\t\t\t\t", url))

            ### 추출한 데이터 → 파일에 쓰기 (skip records already written)
            # Equivalent to the original index-scanning dedup loop.
            if output not in news_rep_arr:
                news_rep_arr.append(output)
                savefile.write(output + "\n")
                totalcnt += 1

        print(" : ", totalcnt)
except Exception:
    # Narrowed from a bare `except:`; still best-effort per run.
    print("ERROR")

print(("\n**************************************************************************************************"))

savefile.close()

# Back To Top