본문 바로가기

동방프로젝트

실전 - 크롤링(식품안전나라 with api)

import datetime, re
import pymysql
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup

# API key issued by the Food Safety Korea open-API portal (placeholder value).
key = '제공받은 키'

# Connection settings (placeholder host and password). The session starts on
# the `mysql` schema and is switched to `foodvar` right after connecting.
dbSettings = {
    'host': 'db 주소',
    'user': 'root',
    'passwd': 'db 루트의 비번',
    'db': 'mysql',
    'port': 3306,
    'charset': 'utf8',
}
conn = pymysql.connect(**dbSettings)
cur = conn.cursor()
cur.execute("USE foodvar")
"""
def store(title, content) :
    cur.execute(
        "INSERT INTO pages(title, content) values (\"%s\", \"%s\")", (title,content)
    )

    cur.connection.commit()

def getLinks(articleUrl) :
    html = urlopen("http://en.wikipedia.org"+ articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # title = bsObj.find("h1").find("span").get_text()
    title = bsObj.find("h1").get_text()
    
    content = bsObj.find("div", {"id": "mw-content-text"}).find("p").get_text()
    store(title, content)
    return bsObj.find("div", {"id": "bodyContent"}).findAll("a", 
    href = re.compile("^(/wiki/)((?!:).)*$"))

links = getLinks("/wiki/Donald_Trump")
try:
    while len(links) > 0:
        newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
        print(newArticle)
        links = getLinks(newArticle)
finally:
    cur.close()
    conn.close()
"""

# Import nutrition records from the Food Safety Korea open API (service I2790)
# into the `nutri` table, one page of rows per HTTP request.

# (nutri column, API field) pairs, in the column order used by the INSERT.
NUTRI_FIELDS = (
    ("num", "NUM"),
    ("code", "FOOD_CD"),
    ("genre", "GROUP_NAME"),
    ("name", "DESC_KOR"),
    ("size", "SERVING_SIZE"),
    ("kcal", "NUTR_CONT1"),
    ("carbs", "NUTR_CONT2"),
    ("prot", "NUTR_CONT3"),
    ("fat", "NUTR_CONT4"),
    ("sugar", "NUTR_CONT5"),
    ("nat", "NUTR_CONT6"),
    ("chole", "NUTR_CONT7"),
    ("satur", "NUTR_CONT8"),
    ("trans", "NUTR_CONT9"),
)

# The placeholders are deliberately NOT wrapped in quotes: the driver quotes
# and escapes every parameter itself. Quoting them again stores the values
# with literal quote characters and turns None into the text "NULL".
insertSql = "INSERT INTO nutri({}) VALUES ({})".format(
    ", ".join(column for column, _field in NUTRI_FIELDS),
    ", ".join(["%s"] * len(NUTRI_FIELDS)),
)

pageSize = 1000
pageCount = 60  # the original script noted 59886 rows in total

startRow = 1
tmp = pageSize  # index of the last row of the current page (inclusive)

try:
    for _page in range(pageCount):
        url = 'https://openapi.foodsafetykorea.go.kr/api/' + key + '/I2790/json/' + \
            str(startRow) + '/' + str(tmp)
        # Close the HTTP response as soon as the body has been read.
        with urlopen(url) as response:
            res = response.read().decode('utf-8')

        # A payload without "I2790" or without "row" carries no records
        # (presumably end of data or an API error message - verify against
        # the API docs); stop instead of failing on a None lookup.
        responseJson = (json.loads(res).get("I2790") or {}).get("row")
        if not responseJson:
            print("No rows returned for range", startRow, "-", tmp, "; stopping.")
            break

        # Insert the whole page in one call and commit once per page rather
        # than once per row.
        cur.executemany(
            insertSql,
            [tuple(row.get(field) for _column, field in NUTRI_FIELDS)
             for row in responseJson],
        )
        conn.commit()

        startRow = tmp + 1
        tmp += pageSize
finally:
    # Release the database resources even when a request or insert fails.
    cur.close()
    conn.close()
#url = 'https://openapi.foodsafetykorea.go.kr/api/' + key + '/I2790/json/' + str(1) + '/' + str(2)
#res = urlopen(url).read().decode('utf-8')
#responseJson = json.loads(res)
#print(responseJson.get("I2790").get("row")[0].get("DESC_KOR"))

"""
1	NUM	번호
2	FOOD_CD	식품코드
7	GROUP_NAME	식품군
8	DESC_KOR	식품이름
12	SERVING_SIZE	총내용량
13	NUTR_CONT1	열량(kcal)(1회제공량당)
14	NUTR_CONT2	탄수화물(g)(1회제공량당)
15	NUTR_CONT3	단백질(g)(1회제공량당)
16	NUTR_CONT4	지방(g)(1회제공량당)
17	NUTR_CONT5	당류(g)(1회제공량당)
18	NUTR_CONT6	나트륨(mg)(1회제공량당)
19	NUTR_CONT7	콜레스테롤(mg)(1회제공량당)
20	NUTR_CONT8	포화지방산(g)(1회제공량당)
21	NUTR_CONT9	트랜스지방(g)(1회제공량당)
"""

'동방프로젝트' 카테고리의 다른 글

크롤러 8  (0) 2021.08.17
크롤링 7  (0) 2021.08.17
크롤러 5  (0) 2021.08.09
크롤러 3  (0) 2021.08.07
파이썬 웹크롤러2  (0) 2021.08.06