|
@@ -0,0 +1,113 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+@author: Deepcold
|
|
|
+@file: deal_bidder.py
|
|
|
+@time: 2019/7/4 9:57
|
|
|
+"""
|
|
|
+import re
|
|
|
+from bson import ObjectId
|
|
|
+from pymongo import MongoClient
|
|
|
+
|
|
|
+
|
|
|
class BidInfo(object):
    """Extract winning-bidder company names for ``WinBidInfo`` documents.

    Construction opens a MongoDB cursor over every document whose
    ``spider_time`` is greater than or equal to ``date``.  ``parse_data``
    walks that cursor and stores the result of ``parse_bidder`` in a
    ``new_bidder`` field on each document that does not have one yet.
    """

    # Full-width / CJK punctuation and full-width digits -> ASCII.  Written
    # with explicit escapes so the table survives editors or pipelines that
    # normalise full-width characters to their ASCII look-alikes.
    _PUNCTUATION_TABLE = str.maketrans(
        "\uff0c\u3002\u3001\uff01\uff1f\uff1a\uff1b\u3010\u3011\uff08\uff09"
        "\uff05\uff03\uff20\uff06"
        "\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19\uff10",
        ",..!?:;[]()%#@&1234567890",
    )

    # Every label variant that is rewritten to the canonical "中标单位".
    # Alternatives are tried left to right, so the trailing "中标人为" can
    # never win over the earlier "中标人"; the leftover "为" is absorbed by
    # the "为*" in _LABEL_WITH_COLON_RE.
    _BIDDER_LABEL_RE = re.compile(
        r"中标供应商名称|中标人名称|中标单位名称|中标商名称|中标人|供应商|成交供应商名称|"
        r"成交人名称|成交人|中标商|中标\(成交\)供应商名称|成交单位|中标企业|中标人为"
    )

    # Separators used to cut the raw text into candidate segments.
    _SEGMENT_SPLIT_RE = re.compile(r"[,.!?;\\*—/]")

    # "<label>[为][:]<4-20 CJK chars or parentheses>公司", shortest match.
    _LABEL_WITH_COLON_RE = re.compile(
        r"中标单位为*:*([\u4E00-\u9FA5()]{4,20}?公司)")
    _LABEL_IN_LONG_TEXT_RE = re.compile(
        r"中标单位:*([\u4E00-\u9FA5()]{4,20}?公司)")

    # Place-name prefixes: an unlabelled segment is accepted as a company
    # name only when it starts with one of these.
    # NOTE(review): the single-character entry "海" matches every segment
    # starting with that character; it may be a truncated place name, but it
    # is kept because removing it would change which names are accepted.
    _CITY_PREFIXES = (
        "河北", "石家庄", "张家口", "承德", "唐山", "秦皇岛", "廊坊", "保定", "沧州", "衡水", "邢台", "邯郸",
        "山西", "太原", "大同", "朔州", "忻州", "阳泉", "晋中", "吕梁", "长治", "临汾", "晋城", "运城", "内蒙古",
        "呼和浩特", "呼伦贝尔", "通辽", "赤峰", "巴彦淖尔", "乌兰察布", "包头", "鄂尔多斯", "乌海", "黑龙江", "哈尔滨",
        "黑河", "伊春", "齐齐哈尔", "鹤岗", "佳木斯", "双鸭山", "绥化", "大庆", "七台河", "鸡西", "牡丹江", "吉林",
        "长春", "白城", "松原", "吉林", "四平", "辽源", "白山", "通化", "辽宁", "沈阳", "铁岭", "阜新", "抚顺",
        "朝阳", "本溪", "辽阳", "鞍山", "盘锦", "锦州", "葫芦岛", "营口", "丹东", "大连", "江苏", "南京", "连云港",
        "徐州", "宿迁", "淮安", "盐城", "泰州", "扬州", "镇江", "南通", "常州", "无锡", "苏州", "浙江", "杭州",
        "湖州", "嘉兴", "绍兴", "舟山", "宁波", "金华", "衢州", "台州", "丽水", "温州", "安徽", "合肥", "淮北",
        "亳州", "宿州", "蚌埠", "阜阳", "淮南", "滁州", "六安", "马鞍山", "芜湖", "宣城", "铜陵", "池州", "安庆",
        "黄山", "福建", "福州", "宁德", "南平", "三明", "莆田", "龙岩", "泉州", "漳州", "厦门", "江西", "南昌",
        "九江", "景德镇", "上饶", "鹰潭", "抚州", "新余", "宜春", "萍乡", "吉安", "赣州", "山东", "济南", "德州",
        "滨州", "东营", "烟台", "威海", "淄博", "潍坊", "聊城", "泰安", "莱芜", "青岛", "日照", "济宁", "菏泽",
        "临沂", "枣庄", "河南", "郑州", "安阳", "鹤壁", "濮阳", "新乡", "焦作", "三门峡", "开封", "洛阳", "商丘",
        "许昌", "平顶山", "周口", "漯河", "南阳", "驻马店", "信阳", "湖北", "武汉", "十堰", "襄樊", "随州", "荆门",
        "孝感", "宜昌", "黄冈", "鄂州", "荆州", "黄石", "咸宁", "湖南", "长沙", "岳阳", "张家界", "常德", "益阳",
        "湘潭", "株洲", "娄底", "怀化", "邵阳", "衡阳", "永州", "郴州", "广东", "广州", "韶关", "梅州", "河源",
        "清远", "潮州", "揭阳", "汕头", "肇庆", "惠州", "佛山", "东莞", "云浮", "汕尾", "江门", "中山", "深圳",
        "珠海", "阳江", "茂名", "湛江", "广西", "南宁", "桂林", "河池", "贺州", "柳州", "百色", "来宾", "梧州",
        "贵港", "玉林", "崇左", "钦州", "防城港", "海", "海南", "海口", "三亚", "三沙", "儋州", "四川", "成都",
        "广元", "巴中", "绵阳", "德阳", "达州", "南充", "遂宁", "广安", "资阳", "眉山", "雅安", "内江", "乐山",
        "自贡", "泸州", "宜宾", "攀枝花", "贵州", "贵阳", "遵义", "六盘水", "安顺", "铜仁", "毕节", "云南", "昆明",
        "昭通", "丽江", "曲靖", "保山", "玉溪", "临沧", "普洱", "西藏", "拉萨", "日喀则", "昌都", "林芝", "山南",
        "那曲", "陕西", "西安", "榆林", "延安", "铜川", "渭南", "宝鸡", "咸阳", "商洛", "汉中", "安康", "甘肃",
        "兰州", "嘉峪关", "酒泉", "张掖", "金昌", "武威", "白银", "庆阳", "平凉", "定西", "天水", "陇南", "青海",
        "西宁", "海东", "宁夏", "银川", "石嘴山", "吴忠", "中卫", "固原", "新疆", "乌鲁木齐", "克拉玛依", "吐鲁番",
        "哈密",
    )

    def __init__(self, date):
        """Connect to MongoDB and open the cursor that ``parse_data`` consumes.

        Args:
            date: Inclusive lower bound compared against each document's
                ``spider_time`` field with ``$gte``.
        """
        # NOTE(review): hosts and credentials are hard-coded in source;
        # consider loading them from configuration or the environment.
        # Credentials go to MongoClient directly because
        # Database.authenticate() no longer exists in PyMongo 4.  authSource
        # is "itslaw", the database the original code authenticated against.
        self.client = MongoClient(
            ['dds-uf6ff5dfd9aef3641601-pub.mongodb.rds.aliyuncs.com:3717',
             'dds-uf6ff5dfd9aef3642555-pub.mongodb.rds.aliyuncs.com:3717'],
            replicaSet='mgset-6501997',
            username='itslaw',
            password='itslaw_168',
            authSource='itslaw',
        )
        self.db_new = self.client["itslaw"].WinBidInfo
        # Cursor over every document at or after the requested date.
        self.response = self.db_new.find({"spider_time": {"$gte": date}})
        self.new_item = {}

    def parse_data(self):
        """Fill in ``new_bidder`` for each cursor document that lacks it.

        The running document index is printed as a progress indicator.
        Documents that already have ``new_bidder`` are left untouched, and
        documents without a string ``bidder`` value are skipped.
        """
        for index, content in enumerate(self.response):
            print(index)
            if "new_bidder" in content:
                continue

            bidder = content.get("bidder")
            if not isinstance(bidder, str):
                # Nothing to parse; leave the document unmodified.
                continue

            new_bidder = self.parse_bidder(bidder, content.get("code"))
            # Address the document by its primary key rather than by the
            # whole fetched document.
            self.db_new.update_one(
                {"_id": content["_id"]},
                {"$set": {"new_bidder": new_bidder}},
            )

    @staticmethod
    def parse_bidder(bidder, code):
        """Extract company names from a raw bidder description.

        Args:
            bidder: Raw bidder text.  ``None``, non-string and empty values
                yield ``""``.
            code: Not used by the extraction; kept so existing callers that
                pass it keep working.

        Returns:
            The distinct company names joined with ``"\\n"`` in order of
            first appearance, or ``""`` when the text mentions "废标" or no
            company name is recognised.
        """
        if not isinstance(bidder, str) or not bidder:
            return ""

        # Convert Chinese / full-width punctuation and digits to ASCII so the
        # separator and label patterns below see a single spelling.
        bidder = bidder.translate(BidInfo._PUNCTUATION_TABLE)

        # Rewrite every label variant to the canonical "中标单位".
        bidder = BidInfo._BIDDER_LABEL_RE.sub("中标单位", bidder)

        if "废标" in bidder:
            return ""

        found = []
        for text in BidInfo._SEGMENT_SPLIT_RE.split(bidder):
            if 6 <= len(text) <= 22 and text.endswith("公司"):
                if ":" in text:
                    # Labelled segment: take the name that follows the label.
                    found.extend(BidInfo._LABEL_WITH_COLON_RE.findall(text))
                elif ("中标" not in text and "候选" not in text
                        and text.startswith(BidInfo._CITY_PREFIXES)):
                    # Unlabelled segment: accept it whole only when it starts
                    # with a known place name.
                    found.append(text)
            elif len(text) > 5 and "公司" in text:
                # Longer free text: pull out names that directly follow the
                # canonical label.
                found.extend(BidInfo._LABEL_IN_LONG_TEXT_RE.findall(text))

        # De-duplicate while keeping first-seen order so the stored value is
        # the same on every run.
        seen = set()
        unique = []
        for name in found:
            if name not in seen:
                seen.add(name)
                unique.append(name)
        return "\n".join(unique)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: process the documents selected by the
    # "2019-09-12" lower bound passed to BidInfo.
    BidInfo("2019-09-12").parse_data()
|