#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Deepcold
@file: deal_bidder.py
@time: 2019/7/4 9:57
"""
  8. import re
  9. from bson import ObjectId
  10. from pymongo import MongoClient
  11. class BidInfo(object):
  12. def __init__(self, date):
  13. self.client = MongoClient(['dds-uf6ff5dfd9aef3641601-pub.mongodb.rds.aliyuncs.com:3717',
  14. 'dds-uf6ff5dfd9aef3642555-pub.mongodb.rds.aliyuncs.com:3717'],
  15. replicaSet='mgset-6501997')
  16. # 授权. 这里的user基于admin数据库授权
  17. self.client["itslaw"].authenticate('itslaw', 'itslaw_168')
  18. self.db_new = self.client["itslaw"].WinBidInfo
  19. # self.db_new = self.client["itslaw"].WinBidInfo_copy1
  20. # 获取所有数据的游标对象
  21. self.response = self.db_new.find({"spider_time": {"$gte": date}})
  22. self.new_item = {}
  23. def parse_data(self):
  24. # 遍历游标对象,获取所有数据
  25. for index, content in enumerate(self.response):
  26. print(index)
  27. if "new_bidder" in content.keys():
  28. pass
  29. else:
  30. bidder = content["bidder"]
  31. code = content["code"]
  32. _id = str(content["_id"])
  33. new_bidder = self.parse_bidder(bidder, code)
  34. new_values = {"$set": {"new_bidder": new_bidder}}
  35. self.db_new.update_one(content, new_values)
  36. @staticmethod
  37. def parse_bidder(bidder, code):
  38. # 中文符号换成英文符号
  39. punctuation_en = u',。、!?:;【】()%#@&1234567890'
  40. punctuation_zh = u',..!?:;[]()%#@&1234567890'
  41. punctuation = str.maketrans(punctuation_en, punctuation_zh)
  42. bidder = bidder.translate(punctuation)
  43. # 统一中标人名称
  44. bidder = re.sub(r"中标供应商名称|中标人名称|中标单位名称|中标商名称|中标人|供应商|成交供应商名称|"
  45. "成交人名称|成交人|中标商|中标\(成交\)供应商名称|成交单位|中标企业|中标人为",
  46. "中标单位", bidder)
  47. if "废标" in bidder:
  48. return ""
  49. else:
  50. text_list = re.split(r"[,.!?;\\*—/]", bidder)
  51. company = []
  52. for text in text_list:
  53. if 22 >= len(text) >= 6 and text.endswith("公司"):
  54. if ":" in text:
  55. temp = re.findall("中标单位为*:*([\u4E00-\u9FA5()]{4,20}?公司)", text)
  56. company.extend(temp)
  57. else:
  58. if "中标" in text or "候选" in text:
  59. pass
  60. else:
  61. city_list = ["河北", "石家庄", "张家口", "承德", "唐山", "秦皇岛", "廊坊", "保定", "沧州", "衡水", "邢台", "邯郸",
  62. "山西", "太原", "大同", "朔州", "忻州", "阳泉", "晋中", "吕梁", "长治", "临汾", "晋城", "运城", "内蒙古",
  63. "呼和浩特", "呼伦贝尔", "通辽", "赤峰", "巴彦淖尔", "乌兰察布", "包头", "鄂尔多斯", "乌海", "黑龙江", "哈尔滨",
  64. "黑河", "伊春", "齐齐哈尔", "鹤岗", "佳木斯", "双鸭山", "绥化", "大庆", "七台河", "鸡西", "牡丹江", "吉林",
  65. "长春", "白城", "松原", "吉林", "四平", "辽源", "白山", "通化", "辽宁", "沈阳", "铁岭", "阜新", "抚顺",
  66. "朝阳", "本溪", "辽阳", "鞍山", "盘锦", "锦州", "葫芦岛", "营口", "丹东", "大连", "江苏", "南京", "连云港",
  67. "徐州", "宿迁", "淮安", "盐城", "泰州", "扬州", "镇江", "南通", "常州", "无锡", "苏州", "浙江", "杭州",
  68. "湖州", "嘉兴", "绍兴", "舟山", "宁波", "金华", "衢州", "台州", "丽水", "温州", "安徽", "合肥", "淮北",
  69. "亳州", "宿州", "蚌埠", "阜阳", "淮南", "滁州", "六安", "马鞍山", "芜湖", "宣城", "铜陵", "池州", "安庆",
  70. "黄山", "福建", "福州", "宁德", "南平", "三明", "莆田", "龙岩", "泉州", "漳州", "厦门", "江西", "南昌",
  71. "九江", "景德镇", "上饶", "鹰潭", "抚州", "新余", "宜春", "萍乡", "吉安", "赣州", "山东", "济南", "德州",
  72. "滨州", "东营", "烟台", "威海", "淄博", "潍坊", "聊城", "泰安", "莱芜", "青岛", "日照", "济宁", "菏泽",
  73. "临沂", "枣庄", "河南", "郑州", "安阳", "鹤壁", "濮阳", "新乡", "焦作", "三门峡", "开封", "洛阳", "商丘",
  74. "许昌", "平顶山", "周口", "漯河", "南阳", "驻马店", "信阳", "湖北", "武汉", "十堰", "襄樊", "随州", "荆门",
  75. "孝感", "宜昌", "黄冈", "鄂州", "荆州", "黄石", "咸宁", "湖南", "长沙", "岳阳", "张家界", "常德", "益阳",
  76. "湘潭", "株洲", "娄底", "怀化", "邵阳", "衡阳", "永州", "郴州", "广东", "广州", "韶关", "梅州", "河源",
  77. "清远", "潮州", "揭阳", "汕头", "肇庆", "惠州", "佛山", "东莞", "云浮", "汕尾", "江门", "中山", "深圳",
  78. "珠海", "阳江", "茂名", "湛江", "广西", "南宁", "桂林", "河池", "贺州", "柳州", "百色", "来宾", "梧州",
  79. "贵港", "玉林", "崇左", "钦州", "防城港", "海", "海南", "海口", "三亚", "三沙", "儋州", "四川", "成都",
  80. "广元", "巴中", "绵阳", "德阳", "达州", "南充", "遂宁", "广安", "资阳", "眉山", "雅安", "内江", "乐山",
  81. "自贡", "泸州", "宜宾", "攀枝花", "贵州", "贵阳", "遵义", "六盘水", "安顺", "铜仁", "毕节", "云南", "昆明",
  82. "昭通", "丽江", "曲靖", "保山", "玉溪", "临沧", "普洱", "西藏", "拉萨", "日喀则", "昌都", "林芝", "山南",
  83. "那曲", "陕西", "西安", "榆林", "延安", "铜川", "渭南", "宝鸡", "咸阳", "商洛", "汉中", "安康", "甘肃",
  84. "兰州", "嘉峪关", "酒泉", "张掖", "金昌", "武威", "白银", "庆阳", "平凉", "定西", "天水", "陇南", "青海",
  85. "西宁", "海东", "宁夏", "银川", "石嘴山", "吴忠", "中卫", "固原", "新疆", "乌鲁木齐", "克拉玛依", "吐鲁番",
  86. "哈密"]
  87. for city in city_list:
  88. if text.startswith(city):
  89. company.append(text)
  90. break
  91. elif len(text) <= 5:
  92. pass
  93. elif text.isdigit():
  94. pass
  95. elif "公司" in text:
  96. temp = re.findall("中标单位:*([\u4E00-\u9FA5()]{4,20}?公司)", text)
  97. company.extend(temp)
  98. else:
  99. pass
  100. # 去除重复内容
  101. company = '\n'.join(set(company))
  102. return company
  103. if __name__ == '__main__':
  104. bid_info = BidInfo("2019-09-12")
  105. bid_info.parse_data()