Python Web Scraping Basics: Data Persistence (Saving Data)

Contents

1. Excel
    - Creating a sheet
    - Writing data in bulk
    - Reading sheet data
    - Case - Saving Douban Top 250 to Excel
    - Case - Saving NetEase News to Excel
2. Json
    - Serialization and deserialization
    - Handling Chinese characters
    - Case - Saving Douban Top 250 to JSON
    - Case - Saving JSON
3. Csv
    - Writing list data to CSV
    - Case - Saving Douban Top 250 to CSV (lists)
    - Writing dict data to CSV
    - Case - Saving Douban Top 250 to CSV (dicts)
    - Reading CSV data
    - Case - Saving NetEase News to CSV

Data Persistence (Saving Data)

1. Excel

Creating a sheet

    import openpyxl  # third-party module: pip install openpyxl

    # 1. Create a workbook object
    work_book = openpyxl.Workbook()

    # 2. Create a sheet object
    sheet1 = work_book.create_sheet('表1')

To work with the default sheet instead, use the workbook object's active property:

    sheet1 = work_book.active

    # 3. Write data into cells of the sheet
    sheet1['A1'] = 'A1'
    sheet1['B7'] = 'B7'

    # cell: a cell object; row is the row number, column is the column number
    sheet1.cell(row=1, column=1).value = 111111
    sheet1.cell(row=2, column=2).value = 222222

    data1 = (1, 2, 3, 4, 5)
    data2 = '45678'

    # sheet1.append(sequence) appends one whole row to the sheet; pass a sequence (list/tuple).
    # In a crawler, the first and second extraction passes hand you exactly this kind of
    # one-record-at-a-time data.
    sheet1.append(data1)
    sheet1.append(data2)

    # 4. Save
    work_book.save('实例.xlsx')

Writing data in bulk

    import openpyxl

    work = openpyxl.Workbook()
    sheet1 = work.active

    # Fill the sheet with a multiplication table, one product per cell
    for i in range(1, 10):
        for j in range(1, i + 1):
            print(f'{j} x {i} = {j * i}', end='\t')
            sheet1.cell(row=i, column=j).value = f'{j} x {i} = {j * i}'
        print()

    work.save('实例.xlsx')

Reading sheet data

    import openpyxl

    workbook = openpyxl.load_workbook('实例.xlsx')
    print(workbook.sheetnames)

    sheet = workbook['Sheet']  # read a specific sheet by name

    print(sheet.max_row)     # largest row number
    print(sheet.max_column)  # largest column number

    # Read the first row
    for i in range(1, sheet.max_column + 1):
        print(sheet.cell(row=1, column=i).value)  # an empty cell returns None

    # Read the first column
    for j in range(1, sheet.max_row + 1):
        print(sheet.cell(row=j, column=1).value)  # an empty cell returns None

    # Read every cell, row by row
    for i in range(1, sheet.max_row + 1):
        for j in range(1, sheet.max_column + 1):
            print(sheet.cell(row=i, column=j).value)
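The loops above read cells one by one, mirroring the original examples. As a side note (not part of the original post), openpyxl can also hand you whole rows at once via iter_rows; a minimal sketch reading the same 实例.xlsx:

    import openpyxl

    workbook = openpyxl.load_workbook('实例.xlsx')
    sheet = workbook.active

    # values_only=True yields plain tuples of cell values instead of Cell objects
    for row in sheet.iter_rows(values_only=True):
        print(row)  # e.g. (1, 2, 3, 4, 5); empty cells come back as None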

Case - Saving Douban Top 250 to Excel

    import parsel
    import requests
    import openpyxl

    # Plan:
    # 1. Create a workbook object
    # 2. Create a sheet object
    # 3. Write data into the sheet's cells
    # 4. Save

    # 1. Create a workbook object
    work = openpyxl.Workbook()

    # 2. Create a sheet object
    sheet1 = work.active

    # Write the header row here? Yes: once, before the loop
    sheet1.append(['标题', '简介', '评分', '评价人数'])

    for page in range(0, 226, 25):
        url = f'https://movie.douban.com/top250?start={page}&filter='
        headers = {
            # The original post used a personal Douban session cookie here (garbled in the
            # scraped copy); substitute your own browser cookie if the page requires it.
            'Cookie': '<your douban cookie>',
            'Host': 'movie.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        response = requests.get(url=url, headers=headers)
        html_data = response.text
        # print(html_data)

        # Parse the data: turn the HTML text into a Selector object
        selector = parsel.Selector(html_data)

        # First extraction: one <li> per movie
        lis = selector.css('.grid_view li')

        # Second extraction: the fields of each movie
        for li in lis:
            title = li.css('.hd a span:nth-child(1)::text').get()
            info = li.css('.bd p:nth-child(1)::text').getall()
            info = '//'.join([i.strip() for i in info])
            score = li.css('.rating_num::text').get()
            follow = li.css('.star span:nth-child(4)::text').get()
            print(title, info, score, follow)

            # Write the header row here? No: call append once per movie to write the data rows
            sheet1.append([title, info, score, follow])

        print('=' * 100 + '\n')

    # Write the header row here? No: all rows are written by now, just save
    work.save('douban.xlsx')

A note on encoding: Microsoft Office's Excel tends to assume the local GBK code page when it opens plain-text data files, while WPS opens UTF-8 files directly. The .xlsx workbook saved above is not affected by this, but keep it in mind for the CSV files later in this post.

Case - Saving NetEase News to Excel

Target site: https://news.163.com/ (scroll down to the 要闻 "Top News" category; use Ctrl+F to search for it if you can't spot it).

Requirements:
- Crawl the first page of the NetEase News 要闻 category.
- Save the data as an Excel sheet.
- The saved fields must include: title, channelname, docurl, imgurl, source, tlink.

    import json
    import re

    import requests
    import openpyxl

    url = 'https://news.163.com/special/cm_yaowen20200213/?callback=data_callback'
    response = requests.get(url=url)
    json_data = response.text
    print(json_data)

    # The response is JSONP, data_callback([...]); pull out the JSON array inside
    result = re.findall(r'data_callback\((.*?)\)', json_data, re.S)
    print(result)

    item_json = json.loads(result[0])
    print(item_json)
    print(type(item_json))

    work = openpyxl.Workbook()
    sheet1 = work.active
    sheet1.append(['title', 'channelname', 'docurl', 'imgurl', 'source', 'tlink'])

    for item in item_json:
        title = item['title']
        channelname = item['channelname']
        docurl = item['docurl']
        imgurl = item['imgurl']
        source = item['source']
        tlink = item['tlink']
        print(title, channelname, docurl, imgurl, source, tlink, sep=' | ')
        sheet1.append([title, channelname, docurl, imgurl, source, tlink])

    work.save('网易新闻.xlsx')
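The regular expression above peels the JSON array out of the JSONP wrapper. Assuming the body is exactly data_callback([...]), plain string slicing works as well; a minimal sketch under that assumption, not from the original post:

    import json

    text = 'data_callback([{"title": "demo"}])'     # stand-in for response.text
    inner = text.strip()[len('data_callback('):-1]  # drop the wrapper, keep the JSON array
    items = json.loads(inner)
    print(items[0]['title'])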

2. Json

Serialization and deserialization

    import json  # built-in module

    # JSON maps onto Python lists [] and dicts {}
    data = {
        'name': 'ACME',
        'shares': 100,
        'price': 542.23
    }

    # JSON serialization: turn a Python object into a JSON string
    # dumps() serializes to a JSON string
    json_str = json.dumps(data)
    print(json_str)
    print(type(json_str))

    # JSON deserialization: turn a JSON string back into a Python object
    # loads() parses a JSON string
    json_obj = json.loads(json_str)
    print(json_obj)
    print(type(json_obj))
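One detail worth knowing (not covered in the original post): a dumps/loads round trip normalizes types, since JSON has no tuple and represents None as null. A tiny illustration:

    import json

    print(json.dumps({'point': (1, 2), 'note': None}))
    # -> {"point": [1, 2], "note": null}

    print(json.loads(json.dumps({'point': (1, 2)})))
    # -> {'point': [1, 2]}   (the tuple comes back as a list)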

Handling Chinese characters

    import json

    data = {
        'name': '青灯',
        'shares': 100,
        'price': 542.23
    }

    # By default json.dumps() escapes non-ASCII characters as \uXXXX sequences,
    # so the Chinese text is unreadable in the output.
    # ensure_ascii=False turns that default escaping off.
    json_str = json.dumps(data, ensure_ascii=False)

    with open('data.json', mode='w', encoding='utf-8') as f:
        f.write(json_str)
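Equivalently, json.dump() can serialize straight into the file object, and json.load() reads it back; indent only pretty-prints the output. A small sketch along the lines of the example above (the parameter choices here are mine, not from the post):

    import json

    data = {'name': '青灯', 'shares': 100, 'price': 542.23}

    with open('data.json', mode='w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)  # write JSON directly to the file

    with open('data.json', mode='r', encoding='utf-8') as f:
        print(json.load(f))                               # read it back as a dict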

Case - Saving Douban Top 250 to JSON

    import json

    import parsel
    import requests

    data = []  # an empty list that collects every record

    for page in range(0, 226, 25):
        url = f'https://movie.douban.com/top250?start={page}&filter='
        headers = {
            # Same personal session cookie as in the Excel case; substitute your own.
            'Cookie': '<your douban cookie>',
            'Host': 'movie.douban.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        response = requests.get(url=url, headers=headers)
        html_data = response.text
        # print(html_data)

        # Parse the data: turn the HTML text into a Selector object
        selector = parsel.Selector(html_data)

        # First extraction: one <li> per movie
        lis = selector.css('.grid_view li')

        # Second extraction: the fields of each movie
        for li in lis:
            title = li.css('.hd a span:nth-child(1)::text').get()
            info = li.css('.bd p:nth-child(1)::text').getall()
            info = '//'.join([i.strip() for i in info])
            score = li.css('.rating_num::text').get()
            follow = li.css('.star span:nth-child(4)::text').get()
            # print(title, info, score, follow)

            d = {'title': title, 'info': info, 'score': score, 'follow': follow}
            data.append(d)

        # print('=' * 100 + '\n')

    print(data)

    # Serialize the collected data: [{}, {}, {}, ...]
    json_str = json.dumps(data, ensure_ascii=False)
    with open('douban.json', mode='w', encoding='utf-8') as f:
        f.write(json_str)

Case - Saving JSON

Target URL: https://www.ku6.com/video/feed?pageNo=0&pageSize=40&subjectId=76 (request method: GET)

Requirements:
1. Request the data from the URL above.
2. Save the returned data into a JSON file named data.json; the JSON string should be visible in the file.

Write the code below:

    import requests

    url = 'https://www.ku6.com/video/feed?pageNo=0&pageSize=40&subjectId=76'
    response = requests.get(url=url)
    json_data = response.text
    print(json_data)

    with open('data.json', mode='w', encoding='utf-8') as f:
        f.write(json_data)

    # No json serialization step is needed here: the response body is already a JSON string.
3. Csv

Writing list data to CSV

The CSV format: each line is one record, and the fields within a line are separated by a delimiter, a comma by default.

    import csv  # built-in module

    data = [
        [1, 2, 3, 4],
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [5, 6, 7, 8]
    ]

    with open('data.csv', mode='a', encoding='utf-8', newline='') as f:
        # newline='' keeps the csv module from inserting blank lines between records
        # csv.writer(f) builds a writer object for csv data; pass it the file object
        csv_write = csv.writer(f)
        for i in data:
            # writerow(i) writes the records one row at a time; pass a list/tuple
            csv_write.writerow(i)

Case - Saving Douban Top 250 to CSV (lists)

    import csv

    import parsel
    import requests

    # Context manager: the file stays open for the whole crawl
    with open('douban-list.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_write = csv.writer(f)
        # csv_write.writerow(['标题', '简介', '评分', '评论人数'])
        f.write('标题,简介,评分,评论人数\n')

        for page in range(0, 226, 25):
            url = f'https://movie.douban.com/top250?start={page}&filter='
            headers = {
                # Same personal session cookie as in the Excel case; substitute your own.
                'Cookie': '<your douban cookie>',
                'Host': 'movie.douban.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            }
            response = requests.get(url=url, headers=headers)
            html_data = response.text
            # print(html_data)

            # Parse the data: turn the HTML text into a Selector object
            selector = parsel.Selector(html_data)

            # First extraction: one <li> per movie
            lis = selector.css('.grid_view li')

            # Second extraction: the fields of each movie
            for li in lis:
                title = li.css('.hd a span:nth-child(1)::text').get()
                info = li.css('.bd p:nth-child(1)::text').getall()
                info = '//'.join([i.strip() for i in info])
                score = li.css('.rating_num::text').get()
                follow = li.css('.star span:nth-child(4)::text').get()
                print(title, info, score, follow)

                # Write the rows inside the loop, one per movie
                csv_write.writerow([title, info, score, follow])

            print('=' * 100 + '\n')

Writing dict data to CSV

The CSV format: each line is one record, and the fields within a line are separated by a delimiter, a comma by default.

    import csv  # built-in module

    list_dict = [
        {'first_name': 'Baked', 'last_name': 'Beans'},
        {'first_name': 'Lovely'},
        {'first_name': 'Wonderful', 'last_name': 'Spam'}
    ]

    with open('data.csv', mode='a', encoding='utf-8', newline='') as f:
        # Build a dict writer: the first argument is the file object,
        # fieldnames lists the dict keys (no extras, none missing, no typos)
        csv_write = csv.DictWriter(f, fieldnames=['first_name', 'last_name'])
        # Dict writers have a dedicated method for writing the header row
        csv_write.writeheader()
        for i in list_dict:
            csv_write.writerow(i)

Case - Saving Douban Top 250 to CSV (dicts)

    import csv

    import parsel
    import requests

    with open('douban-dict.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_write = csv.DictWriter(f, fieldnames=['title', 'info', 'score', 'follow'])
        csv_write.writeheader()  # write the header row; only dict writers have this, list writers do not

        for page in range(0, 226, 25):
            url = f'https://movie.douban.com/top250?start={page}&filter='
            headers = {
                # Same personal session cookie as in the Excel case; substitute your own.
                'Cookie': '<your douban cookie>',
                'Host': 'movie.douban.com',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            }
            response = requests.get(url=url, headers=headers)
            html_data = response.text
            # print(html_data)

            # Parse the data: turn the HTML text into a Selector object
            selector = parsel.Selector(html_data)

            # First extraction: one <li> per movie
            lis = selector.css('.grid_view li')

            # Second extraction: the fields of each movie
            for li in lis:
                title = li.css('.hd a span:nth-child(1)::text').get()
                info = li.css('.bd p:nth-child(1)::text').getall()
                info = '//'.join([i.strip() for i in info])
                score = li.css('.rating_num::text').get()
                follow = li.css('.star span:nth-child(4)::text').get()
                print(title, info, score, follow)

                d = {'title': title, 'info': info, 'score': score, 'follow': follow}
                csv_write.writerow(d)

            print('=' * 100 + '\n')
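As mentioned in the encoding note earlier, Microsoft Excel may show garbled Chinese when it opens these UTF-8 CSV files. A common workaround (my addition, not from the post; the file name and sample row are only illustrative) is to write with the utf-8-sig encoding, which prepends a BOM that Excel recognizes:

    import csv

    # utf-8-sig writes a byte-order mark so Excel detects UTF-8 correctly
    with open('data-excel.csv', mode='w', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['标题', '评分'])
        writer.writerow(['示例标题', '9.0'])  # Chinese text now displays correctly in Excel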

Reading CSV data

Read the file directly, as plain text:

    import csv

    with open('data.csv', mode='r', encoding='utf-8') as f:
        print(f.read())

Read through csv.reader; each row comes back as a list:

    with open('douban-list.csv', mode='r', encoding='utf-8') as f:
        csv_read = csv.reader(f)
        print(csv_read)
        for i in csv_read:
            print(i)

Read through csv.DictReader; each row comes back as a dict-like object:

    with open('douban-list.csv', mode='r', encoding='utf-8') as f:
        csv_read = csv.DictReader(f)
        print(csv_read)
        for i in csv_read:
            print(i)
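If you want the whole file in memory as ordinary Python objects, the reader can simply be wrapped in list(); a small sketch, not from the original post, which assumes the header row written in the Douban list example above:

    import csv

    with open('douban-list.csv', mode='r', encoding='utf-8') as f:
        rows = list(csv.DictReader(f))  # list of dicts, one per CSV record

    print(len(rows))
    print(rows[0]['标题'])              # key taken from the '标题,简介,评分,评论人数' header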

Case - Saving NetEase News to CSV

Target site: https://news.163.com/ (scroll down to the 要闻 "Top News" category).

Requirements:
- Crawl the first page of the NetEase News 要闻 category.
- Save the data in CSV format.
- The saved fields must include: title, channelname, docurl, imgurl, source, tlink.

    import csv
    import json
    import re

    import requests

    url = 'https://news.163.com/special/cm_yaowen20200213/?callback=data_callback'
    response = requests.get(url=url)
    json_data = response.text

    # The response is JSONP, data_callback([...]); pull out the JSON array inside
    result = re.findall(r'data_callback\((.*?)\)', json_data, re.S)
    item_json = json.loads(result[0])

    with open('网易新闻.csv', mode='a', encoding='utf-8', newline='') as f:
        write = csv.writer(f)
        write.writerow(['title', 'channelname', 'docurl', 'imgurl', 'source', 'tlink'])

        for item in item_json:
            title = item['title']
            channelname = item['channelname']
            docurl = item['docurl']
            imgurl = item['imgurl']
            source = item['source']
            tlink = item['tlink']
            print(title, channelname, docurl, imgurl, source, tlink, sep=' | ')
            write.writerow([title, channelname, docurl, imgurl, source, tlink])