#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Deepcold
@file: deal_bidder.py
@time: 2019/7/4 9:57

Back-fill a ``new_bidder`` field on ``WinBidInfo`` documents by extracting
company names from the free-text ``bidder`` field.
"""
import re

from bson import ObjectId
from pymongo import MongoClient

# Province / city name prefixes. A bare segment ending in "公司" that carries
# no "中标单位:" label is only accepted as a company name when it starts with
# one of these prefixes.
# NOTE(review): the lone "海" entry is probably a typo for "北海"; as written
# it accepts every segment starting with "海". It is kept unchanged so that
# matching behaviour stays the same -- confirm before correcting.
_CITY_PREFIXES = (
    "河北", "石家庄", "张家口", "承德", "唐山", "秦皇岛", "廊坊", "保定", "沧州", "衡水", "邢台", "邯郸",
    "山西", "太原", "大同", "朔州", "忻州", "阳泉", "晋中", "吕梁", "长治", "临汾", "晋城", "运城",
    "内蒙古", "呼和浩特", "呼伦贝尔", "通辽", "赤峰", "巴彦淖尔", "乌兰察布", "包头", "鄂尔多斯", "乌海",
    "黑龙江", "哈尔滨", "黑河", "伊春", "齐齐哈尔", "鹤岗", "佳木斯", "双鸭山", "绥化", "大庆", "七台河",
    "鸡西", "牡丹江",
    "吉林", "长春", "白城", "松原", "吉林", "四平", "辽源", "白山", "通化",
    "辽宁", "沈阳", "铁岭", "阜新", "抚顺", "朝阳", "本溪", "辽阳", "鞍山", "盘锦", "锦州", "葫芦岛",
    "营口", "丹东", "大连",
    "江苏", "南京", "连云港", "徐州", "宿迁", "淮安", "盐城", "泰州", "扬州", "镇江", "南通", "常州",
    "无锡", "苏州",
    "浙江", "杭州", "湖州", "嘉兴", "绍兴", "舟山", "宁波", "金华", "衢州", "台州", "丽水", "温州",
    "安徽", "合肥", "淮北", "亳州", "宿州", "蚌埠", "阜阳", "淮南", "滁州", "六安", "马鞍山", "芜湖",
    "宣城", "铜陵", "池州", "安庆", "黄山",
    "福建", "福州", "宁德", "南平", "三明", "莆田", "龙岩", "泉州", "漳州", "厦门",
    "江西", "南昌", "九江", "景德镇", "上饶", "鹰潭", "抚州", "新余", "宜春", "萍乡", "吉安", "赣州",
    "山东", "济南", "德州", "滨州", "东营", "烟台", "威海", "淄博", "潍坊", "聊城", "泰安", "莱芜",
    "青岛", "日照", "济宁", "菏泽", "临沂", "枣庄",
    "河南", "郑州", "安阳", "鹤壁", "濮阳", "新乡", "焦作", "三门峡", "开封", "洛阳", "商丘", "许昌",
    "平顶山", "周口", "漯河", "南阳", "驻马店", "信阳",
    "湖北", "武汉", "十堰", "襄樊", "随州", "荆门", "孝感", "宜昌", "黄冈", "鄂州", "荆州", "黄石", "咸宁",
    "湖南", "长沙", "岳阳", "张家界", "常德", "益阳", "湘潭", "株洲", "娄底", "怀化", "邵阳", "衡阳",
    "永州", "郴州",
    "广东", "广州", "韶关", "梅州", "河源", "清远", "潮州", "揭阳", "汕头", "肇庆", "惠州", "佛山",
    "东莞", "云浮", "汕尾", "江门", "中山", "深圳", "珠海", "阳江", "茂名", "湛江",
    "广西", "南宁", "桂林", "河池", "贺州", "柳州", "百色", "来宾", "梧州", "贵港", "玉林", "崇左",
    "钦州", "防城港", "海",
    "海南", "海口", "三亚", "三沙", "儋州",
    "四川", "成都", "广元", "巴中", "绵阳", "德阳", "达州", "南充", "遂宁", "广安", "资阳", "眉山",
    "雅安", "内江", "乐山", "自贡", "泸州", "宜宾", "攀枝花",
    "贵州", "贵阳", "遵义", "六盘水", "安顺", "铜仁", "毕节",
    "云南", "昆明", "昭通", "丽江", "曲靖", "保山", "玉溪", "临沧", "普洱",
    "西藏", "拉萨", "日喀则", "昌都", "林芝", "山南", "那曲",
    "陕西", "西安", "榆林", "延安", "铜川", "渭南", "宝鸡", "咸阳", "商洛", "汉中", "安康",
    "甘肃", "兰州", "嘉峪关", "酒泉", "张掖", "金昌", "武威", "白银", "庆阳", "平凉", "定西", "天水", "陇南",
    "青海", "西宁", "海东",
    "宁夏", "银川", "石嘴山", "吴忠", "中卫", "固原",
    "新疆", "乌鲁木齐", "克拉玛依", "吐鲁番", "哈密",
)

# Character-for-character translation table: Chinese punctuation on the left
# is replaced by the ASCII character at the same position on the right.
_PUNCTUATION_TABLE = str.maketrans(
    u',。、!?:;【】()%#@&1234567890',
    u',..!?:;[]()%#@&1234567890',
)

# The many label variants that are normalised to the single label "中标单位".
# Both fragments are raw strings so that "\(" is a regex escape rather than an
# (invalid) Python string escape.
_LABEL_RE = re.compile(
    r"中标供应商名称|中标人名称|中标单位名称|中标商名称|中标人|供应商|成交供应商名称|"
    r"成交人名称|成交人|中标商|中标\(成交\)供应商名称|成交单位|中标企业|中标人为"
)

# Punctuation that separates candidate segments inside the bidder text.
_SEGMENT_SPLIT_RE = re.compile(r"[,.!?;\\*—/]")

# "中标单位[为...][:...]<company>" -- used for short segments that contain a colon.
_LABELLED_SHORT_RE = re.compile("中标单位为*:*([\u4E00-\u9FA5()]{4,20}?公司)")

# "中标单位[:...]<company>" -- used for longer segments that mention "公司".
_LABELLED_LONG_RE = re.compile("中标单位:*([\u4E00-\u9FA5()]{4,20}?公司)")


class BidInfo(object):
    """Read ``WinBidInfo`` documents from MongoDB and add ``new_bidder``."""

    def __init__(self, date):
        """Connect to the replica set and open a cursor over recent documents.

        :param date: lower bound (inclusive, ``$gte``) compared against each
            document's ``spider_time`` field.
        """
        # NOTE(security): host names and credentials are hard-coded in source;
        # they should be moved to configuration / environment variables.
        self.client = MongoClient(['dds-uf6ff5dfd9aef3641601-pub.mongodb.rds.aliyuncs.com:3717',
                                   'dds-uf6ff5dfd9aef3642555-pub.mongodb.rds.aliyuncs.com:3717'],
                                  replicaSet='mgset-6501997')
        # Authenticate through the "itslaw" database handle.
        # NOTE(review): the original comment said the user is granted on the
        # admin database -- confirm the intended auth source.
        # NOTE(review): Database.authenticate() was removed in PyMongo 4.0; on
        # newer drivers pass username/password/authSource to MongoClient.
        self.client["itslaw"].authenticate('itslaw', 'itslaw_168')
        self.db_new = self.client["itslaw"].WinBidInfo
        # Alternative collection left commented out in the original:
        # self.db_new = self.client["itslaw"].WinBidInfo_copy1
        # Cursor over every document crawled on or after ``date``.
        self.response = self.db_new.find({"spider_time": {"$gte": date}})
        self.new_item = {}

    def parse_data(self):
        """Compute and store ``new_bidder`` for documents that lack it.

        Iterates the cursor created in ``__init__``, prints the running index
        of each document, skips documents that already contain a
        ``new_bidder`` key, and writes the parsed value back with ``$set``.
        """
        for index, content in enumerate(self.response):
            print(index)
            if "new_bidder" in content:
                continue
            # .get() so that one document without these fields does not abort
            # the whole run; parse_bidder() returns "" for a missing bidder.
            new_bidder = self.parse_bidder(content.get("bidder"), content.get("code"))
            # Filter on the primary key only. Using the whole fetched document
            # as the filter forces an exact match on every field.
            self.db_new.update_one({"_id": content["_id"]},
                                   {"$set": {"new_bidder": new_bidder}})

    @staticmethod
    def parse_bidder(bidder, code):
        """Extract company names from a free-text bidder description.

        :param bidder: raw bidder text; ``None`` or an empty string yields ``""``.
        :param code: accepted for interface compatibility; not used.
        :return: the distinct company names joined by ``"\\n"`` in order of
            first appearance, or ``""`` when nothing is found or the text
            contains "废标".
        """
        if not bidder:
            return ""

        # Replace Chinese punctuation with ASCII so splitting is uniform.
        bidder = bidder.translate(_PUNCTUATION_TABLE)
        # Collapse all label variants into the single label "中标单位".
        bidder = _LABEL_RE.sub("中标单位", bidder)
        if "废标" in bidder:
            return ""

        company = []
        for text in _SEGMENT_SPLIT_RE.split(bidder):
            if 6 <= len(text) <= 22 and text.endswith("公司"):
                if ":" in text:
                    company.extend(_LABELLED_SHORT_RE.findall(text))
                elif "中标" in text or "候选" in text:
                    # Label-like text without a colon: not a clean company name.
                    continue
                elif text.startswith(_CITY_PREFIXES):
                    # Unlabelled segment: accept only with a known place prefix.
                    company.append(text)
            elif len(text) <= 5 or text.isdigit():
                continue
            elif "公司" in text:
                company.extend(_LABELLED_LONG_RE.findall(text))

        # De-duplicate while keeping first-seen order so the result is
        # deterministic (a plain set() has arbitrary iteration order).
        return '\n'.join(dict.fromkeys(company))


if __name__ == '__main__':
    bid_info = BidInfo("2019-09-12")
    bid_info.parse_data()