A Maoyan Movie Data Crawling and Visualization Analysis System Based on Python, Node.js, and MongoDB

Contents
Experiment 3: Database Implementation Design
I. Experiment Objectives
II. Experiment Content
III. Experiment Requirements
IV. Design Process and Implementation Plan
1 Introduction
1.1 Purpose
1.2 Background
2.1 Functions
2.2 Performance
2.3 Security and Confidentiality
3 Runtime Environment
3.1 Hardware Platform
3.2 Supporting Software
3.3 Database
4 Software Design
4.1 Data Crawling Module
4.2 Server Setup
4.3 Movie Data Display
4.4 Movie Recommendation
5 Usage
5.1 Starting the Server
5.3 Movie Recommendation

I. Experiment Objectives
Following the design requirements, assemble the components, classes, and other elements into the final deliverable information system. The key to every feature is the database connection: connecting to the database correctly and efficiently, and being able to query, insert, and update data, is central to this software. The experiment covers the full process of analyzing, developing, and debugging a database application, and lets students take part in the project in the role of a system analyst and designer, strengthening their ability to analyze and solve problems independently.
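
As a point of reference for these operations, here is a minimal sketch of connecting to MongoDB and performing the basic query, insert, update, and delete operations with pymongo (the host, port, and the maoyan/films database and collection names are illustrative assumptions):

from pymongo import MongoClient

# Connect to a local MongoDB instance (host and port are assumptions)
client = MongoClient('localhost', 27017)
films = client['maoyan']['films']

# Insert a movie document
films.insert_one({'name': '霸王别姬', 'score': '9.6分'})
# Query it back
print(films.find_one({'name': '霸王别姬'}))
# Update a field
films.update_one({'name': '霸王别姬'}, {'$set': {'score': '9.7分'}})
# Delete the document
films.delete_one({'name': '霸王别姬'})
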
II. Experiment Content
1. Decide how to connect to the database so that it suits the system as well as possible and is convenient for development. The goal is to connect to the database correctly and efficiently and to perform effective query, insert, update, and delete operations on it.
2. Implement the classes: write the class code and place it into components, including class generation, class-attribute code, and class-operation code.
3. Carry out interface design, covering external interfaces, internal interfaces, and so on.
4. Implement the data structures, including the logical structure, the physical structure, and the relationship between the data structures and the program code (a table-definition sketch follows this list).
5. Carry out the most important part of the implementation: the database implementation and the user-interface implementation.
6. Finally, implement or integrate the subsystems and, following the design requirements, assemble them into the final deliverable information system.
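
To make item 4 concrete, the sketch below creates a films table whose columns match the keys that the crawler's to_mysql function writes (name, type, country, length, released, score, people, box_office); the column types and sizes are assumptions rather than taken from this report:

import pymysql

# Connect with the same local MySQL credentials the crawler uses (adjust to your own setup)
db = pymysql.connect(host='localhost', user='root', password='111111', port=3306)
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS maoyan DEFAULT CHARACTER SET utf8mb4")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS maoyan.films (
        name       VARCHAR(255) NOT NULL,  -- movie title
        type       VARCHAR(255),           -- genre string
        country    VARCHAR(255),           -- country/region
        length     VARCHAR(64),            -- running time
        released   VARCHAR(32),            -- release date (first 10 characters)
        score      VARCHAR(32),            -- Maoyan score, e.g. '9.0分'
        people     INT,                    -- number of raters
        box_office BIGINT                  -- box office converted to yuan
    )
""")
db.commit()
db.close()
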
III. Experiment Requirements
The requirements cover determining the system's implementation structure, implementing the subsystems, classes, and interfaces, unit testing, and system integration. Concretely, first determine the implementation structure and then draw up an iteration plan. Over several iterations, implement each subsystem together with its classes and interfaces and unit-test them; feed the result of each iteration into system integration, and complete the final system through repeated iterations.
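
As an illustration of the unit-testing step, here is a minimal pytest sketch for the str_to_dict helper defined in the crawler code below (the module name crawler is an assumption about how the file is saved):

# test_crawler.py -- run with `pytest`
from crawler import str_to_dict  # assumes the crawler below is saved as crawler.py


def test_str_to_dict_parses_header_lines():
    header = """
    Host:maoyan.com
    Connection:keep-alive
    """
    headers = str_to_dict(header)
    assert headers['Host'] == 'maoyan.com'
    assert headers['Connection'] == 'keep-alive'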

import re
import time
import random
import pymysql
import requests
from bs4 import BeautifulSoup
from fontTools.ttLib import TTFont
# Project-local helper modules for writing to MongoDB and MySQL (not PyPI packages)
import mongodb
import mysqldb

head = """
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding:gzip, deflate, br
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:max-age=0
Connection:keep-alive
Host:maoyan.com
Upgrade-Insecure-Requests:1
Content-Type:application/x-www-form-urlencoded; charset=UTF-8
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36
"""


def str_to_dict(header):
    """
    Build a request-header dict from a header string, so each function can compose its own headers.
    """
    header_dict = {}
    header = header.split('\n')
    for h in header:
        h = h.strip()
        if h:
            k, v = h.split(':', 1)
            header_dict[k] = v.strip()
    return header_dict


def get_url():
    """
    Collect movie detail-page links from the Maoyan list pages.
    """
    for i in range(150, 300, 30):

        # Sleep a random interval (random() returns a value in [0, 1)) to avoid getting the IP banned
        time.sleep(random.random() * 4)
        url = 'http://maoyan.com/films?showType=3&yearId=13&sortId=3&offset=' + str(i)
        host = """Referer:http://maoyan.com/films?showType=3&yearId=13&sortId=3&offset=0
        """
        header = head + host
        headers = str_to_dict(header)
        response = requests.get(url=url, headers=headers)


        soup = BeautifulSoup(response.text, 'html.parser')
        data_1 = soup.find_all('div', {'class': 'channel-detail movie-item-title'})
        data_2 = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})
        num = 0
        for item in data_1:
            num += 1
            time.sleep(10)
            url_1 = item.select('a')[0]['href']
            if data_2[num - 1].get_text() != '暂无评分':
                url = 'http://maoyan.com' + url_1
                for message in get_message(url):
                    # print(message)
                    # Write to MySQL:
                    # to_mysql(message)
                    # Write to MongoDB:
                    to_mymongo(message)
                print(url)
                print('---------------^^^Film_Message^^^-----------------')
            else:
                print('The Work Is Done')
                break


def get_message(url):
    """
    Scrape the information on a movie's detail page.
    """
    # time.sleep(10)
    time.sleep(random.random() * 4)
    data = {}
    host = """Referer:http://maoyan.com/news
    """
    header = head + host
    headers = str_to_dict(header)
    response = requests.get(url=url, headers=headers)
    u = response.text
    # Decode Maoyan's custom-font (anti-scraping) digits
    (maoyan_num_list, utf8last) = get_numbers(u)
    # Parse the movie information from the page
    soup = BeautifulSoup(u, "html.parser")
    mw = soup.find_all('span', {'class': 'stonefont'})
    score = soup.find_all('span', {'class': 'score-num'})
    unit = soup.find_all('span', {'class': 'unit'})
    ell = soup.find_all('li', {'class': 'ellipsis'})
    name = soup.find_all('h3', {'class': 'name'})

    user = soup.find_all('span', {'class': 'name'})
    time_comment = soup.find_all('div', {'class': 'time'})
    score_start = soup.find_all('ul', {'class': 'score-star clearfix'})
    comment_approve = soup.find_all('span', {'class': 'num'})
    comment_content = soup.find_all('div', {'class': 'comment-content'})
    # print(time_comment[0].find('span').get_text())

    # Assemble the movie record
    # _id +=1

    data["name"] = name[0].get_text()
    data["type"] = ell[0].get_text()
    data["country"] = ell[1].get_text().split('/')[0].strip().replace('\n', '')
    data["length"] = ell[1].get_text().split('/')[1].strip().replace('\n', '')
    data["released"] = ell[2].get_text()[:10]

    # Some movies have no box-office figure, so branch on whether a unit element exists
    if unit:
        bom = ['分', score[0].get_text().replace('.', '').replace('万', ''), unit[0].get_text()]
        for i in range(len(mw)):
            moviewish = mw[i].get_text().encode('utf-8')
            moviewish = str(moviewish, encoding='utf-8')
            # Map the obfuscated glyph codes back to real digits by comparison
            for j in range(len(utf8last)):
                moviewish = moviewish.replace(utf8last[j], maoyan_num_list[j])
            if i == 0:
                data["score"] = moviewish + bom[i]
            elif i == 1:
                if '万' in moviewish:
                    data["people"] = int(float(moviewish.replace('万', '')) * 10000)
                else:
                    data["people"] = int(float(moviewish))
            else:
                if '万' == bom[i]:
                    data["box_office"] = int(float(moviewish) * 10000)
                else:
                    data["box_office"] = int(float(moviewish) * 100000000)
    else:
        bom = ['分', score[0].get_text().replace('.', '').replace('万', ''), 0]
        for i in range(len(mw)):
            moviewish = mw[i].get_text().encode('utf-8')
            moviewish = str(moviewish, encoding='utf-8')
            for j in range(len(utf8last)):
                moviewish = moviewish.replace(utf8last[j], maoyan_num_list[j])
            if i == 0:
                data["score"] = moviewish + bom[i]
            else:
                if '万' in moviewish:
                    data["people"] = int(float(moviewish.replace('万', '')) * 10000)
                else:
                    data["people"] = int(float(moviewish))
        data["box_office"] = bom[2]

    # Optionally append user comments at the end (left commented out)
    # data["user"] = []
    # data["time_comment"] = []
    # data["score_start"] = []
    # data["comment_approve"] = []
    # data["comment_content"] = []
    #
    # numbre = len(user)
    # for i in range(numbre):
    #     data["user"].append(user[i].get_text())
    #     data["time_comment"].append(time_comment[i].find('span').get_text())
    #     data["score_start"].append(score_start[i].get('data-score'))
    #     data["comment_approve"].append(comment_approve[i].get_text())
    #     data["comment_content"].append(comment_content[i].get_text())

    yield data


def to_mysql(data):
    """
    Write a movie record into MySQL.
    """
    # Create the maoyan database via the local mysqldb helper (fill in your own MySQL username and password)
    maoyandb = mysqldb(user='root', password='111111')
    table1 = 'films'
    # table2 = 'users'
    keys = ', '.join(data.keys())
    # keys = 'name,type,country,length,released,score,people,box office'
    # values1='%s'
    values = ', '.join(['%s'] * len(data))
    db = pymysql.connect(host='localhost', user='root', password='111111', port=3306, db='maoyan')

    cursor = db.cursor()
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table1, keys=keys, values=values)
    # sql1 = 'INSERT INTO {table} VALUES ({values})'.format(table=table1, keys=keys, values=values)
    try:
        if cursor.execute(sql, tuple(data.values())):
            print("Successful")
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()


def to_mymongo(data):
    # Connect to MongoDB via the local mongodb helper
    # dbname is the database name, collectionName is the movie collection
    dbname = 'maoyan'
    collectionName = 'testmaoyan'
    db = mongodb.mongoDB(dbname, collectionName)

    # Use MongoDB's default _id
    item = db.process_item(data, True)
    print(item)


def get_numbers(u):
    """
    Break Maoyan's custom-font obfuscation of digits.
    """
    cmp = re.compile(r",\n           url\('(//.*\.woff)'\) format\('woff'\)")
    rst = cmp.findall(u)
    ttf = requests.get("http:" + rst[0], stream=True)
    # Save the page's woff font file locally as maoyan.woff
    with open("maoyan.woff", "wb") as f:
        for chunk in ttf.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    # base.woff is a reference font saved locally whose glyph-to-digit mapping is known (base_num_list below)
    base_font = TTFont('base.woff')
    maoyanFont = TTFont('maoyan.woff')
    maoyan_unicode_list = maoyanFont['cmap'].tables[0].ttFont.getGlyphOrder()
    maoyan_num_list = []
    base_num_list = ['.', '3', '0', '8', '9', '4', '1', '5', '2', '7', '6']
    # base_unicode_list = ['x', 'uniF561', 'uniE6E1', 'uniF125', 'uniF83F', 'uniE9E2', 'uniEEA6', 'uniEEC2', 'uniED38', 'uniE538', 'uniF8E7']
    base_unicode_list = ['x', 'uniF849', 'uniE581', 'uniF178', 'uniF533',
                         'uniEC0F', 'uniED67', 'uniEF38', 'uniE223', 'uniF7C6', 'uniF89D']
    for i in range(1, 12):
        maoyan_glyph = maoyanFont['glyf'][maoyan_unicode_list[i]]
        for j in range(11):
            base_glyph = base_font['glyf'][base_unicode_list[j]]
            if maoyan_glyph == base_glyph:
                maoyan_num_list.append(base_num_list[j])
                break
    maoyan_unicode_list[1] = 'uni0078'
    utf8List = [eval(r"'\u" + uni[3:] + "'").encode("utf-8") for uni in maoyan_unicode_list[1:]]
    utf8last = []
    for i in range(len(utf8List)):
        utf8List[i] = str(utf8List[i], encoding='utf-8')
        utf8last.append(utf8List[i])
    return (maoyan_num_list, utf8last)


def main():
    time.sleep(random.random() * 3)
    get_url()


if __name__ == '__main__':
    main()
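
The import mongodb at the top of the crawler refers to a small project-local helper module whose code is not reproduced in this report. A minimal sketch of what such a helper might look like, assuming pymongo and a local MongoDB instance (the class and method names mirror how the crawler calls them; everything else is an assumption):

# mongodb.py -- a possible shape for the project-local helper (a sketch, not this report's actual code)
import pymongo


class mongoDB:
    def __init__(self, dbname, collectionName):
        # Connect to a local MongoDB instance (host and port are assumptions)
        client = pymongo.MongoClient('localhost', 27017)
        self.collection = client[dbname][collectionName]

    def process_item(self, item, use_default_id=True):
        # Insert one movie document; when use_default_id is True, MongoDB generates the _id itself
        doc = dict(item)
        if not use_default_id:
            doc['_id'] = doc.get('name')
        self.collection.insert_one(doc)
        return doc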

Author: shejizuopin
