parseData.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. """
  4. @author: Deepcold
  5. @file: parseData.py
  6. @time: 2019/8/14 15:55
  7. """
  8. import datetime
  9. import json
  10. from bin.common.esQuery import es_query_bg_name
  11. from bin.common.parse_name_is_company import parse_name
  12. from bin.utils.json_dump_date import JsonCustomEncoder
  13. def parse_data(db_config, each, push_date, save_items):
  14. table_name = db_config.table_name
  15. query_words = db_config[table_name].QUERY_WORDS # 查询关键字
  16. summary_fields = db_config[table_name].SUMMARY_FIELDS
  17. event_type = db_config.event_type # 事件类型
  18. event_subtype = db_config.event_sub_type # 子分类
  19. event_date = each[query_words["event_date"]] # 事件发生日期
  20. _id = each["_id"]
  21. try:
  22. event_company = each[query_words["event_company"]] # 事件企业
  23. # 解析被告是企业不是个人
  24. if event_company:
  25. event_company_list = parse_name(event_company)
  26. else:
  27. event_company_list = []
  28. except Exception as e:
  29. event_company_list = []
  30. print(e)
  31. source_content = each # 原数据
  32. # 构建摘要信息
  33. summary_information = {}
  34. for word in summary_fields:
  35. summary_information[db_config[table_name].ALL_FIELDS[word]] = each[word]
  36. source_content = json.dumps(source_content, cls=JsonCustomEncoder, ensure_ascii=False)
  37. summary_information = json.dumps(summary_information, cls=JsonCustomEncoder, ensure_ascii=False)
  38. for event_company in event_company_list:
  39. # 判断企业在es中是不是存在
  40. es_content = es_query_bg_name(event_company)
  41. # 如果在es中存在
  42. if es_content:
  43. new_content = []
  44. for temp in es_content:
  45. source = temp["_source"]
  46. yg_name = source["yishen_yg"]
  47. if parse_name(yg_name):
  48. new_content.append(temp)
  49. item = {}
  50. if new_content:
  51. # 封装成字典
  52. item["event_type"] = event_type
  53. item["event_subtype"] = event_subtype
  54. item["event_desc"] = summary_information
  55. item["event_company"] = event_company
  56. item["event_date"] = event_date
  57. item["push_date"] = push_date
  58. item["source_platform"] = table_name
  59. item["source_id"] = _id
  60. item["source_content"] = source_content
  61. item["hasBgCase"] = new_content
  62. # if type(item["event_date"]) is str:
  63. # if item["push_date"] - datetime.datetime.date(
  64. # datetime.datetime.strptime(item["event_date"], '%Y-%m-%d')) <= datetime.timedelta(7):
  65. # # print(item)
  66. # save_items.append(item)
  67. # elif type(item["event_date"]) is datetime.date:
  68. # if item["push_date"] -item["event_date"] <= datetime.timedelta(7):
  69. # save_items.append(item)
  70. print(item)
  71. save_items.append(item)