# xgl.py
  1. import json
  2. import re
  3. import time
  4. from RS import Get_Cookie
  5. from Captcha import Init_Captcha
  6. import ddddocr
  7. class Get_Info:
  8. def __init__(self):
  9. self.headers = {
  10. "Connection": "keep-alive",
  11. "Pragma": "no-cache",
  12. "Cache-Control": "no-cache",
  13. "Accept": "application/json, text/javascript, */*; q=0.01",
  14. "X-Requested-With": "XMLHttpRequest",
  15. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
  16. "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
  17. "Origin": "http://zxgk.court.gov.cn",
  18. "Referer": "http://zxgk.court.gov.cn/xgl/",
  19. "Accept-Language": "zh-CN,zh;q=0.9"
  20. }
  21. self.Get_Cookie = Get_Cookie()
  22. self.session, self.ctx200 = self.Get_Cookie.main()
  23. self.updateCookie = self.Get_Cookie.update_cookie
  24. self.initURL = "http://zxgk.court.gov.cn/xgl"
  25. self.ListURL = "http://zxgk.court.gov.cn/xgl/searchXgl.do"
  26. self.ocr = ddddocr.DdddOcr()
  27. self.totalPage = 0
  28. self.pdfs = []
  29. self.file = open("xglLink.txt","w",encoding="utf-8")
  30. def init_page(self):
  31. init_page_response = self.session.get(url=self.initURL, headers=self.headers)
  32. print("init_page_response ===>", init_page_response)
  33. # print("init_page_response.text ===>",init_page_response.text)
  34. self.updateCookie()
  35. def init_captcha(self):
  36. self.CaptchaURL, self.captchaId, self.MyRandom = Init_Captcha().main()
  37. Captcha_headers = {
  38. "Connection": "keep-alive",
  39. "Pragma": "no-cache",
  40. "Cache-Control": "no-cache",
  41. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
  42. "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
  43. "Referer": "http://zxgk.court.gov.cn/xgl/",
  44. "Accept-Language": "zh-CN,zh;q=0.9"
  45. }
  46. CaptchaResponse = self.session.get(url=self.CaptchaURL,headers=Captcha_headers)
  47. print("请求验证码的响应结果 ===>",CaptchaResponse)
  48. with open("captcha.jpg", 'wb') as f:
  49. f.write(CaptchaResponse.content)
  50. with open("captcha.jpg", 'rb') as f:
  51. image = f.read()
  52. self.pCode = self.ocr.classification(image)
  53. print("验证码的结果 ===>",self.pCode)
  54. self.updateCookie()
  55. def check_Captcha(self):
  56. url = "checkyzm.do?captchaId=" + self.captchaId + "&pCode=" + self.pCode + ""
  57. checkURL = self.ctx200.call("check_yzm","GET",url,self.captchaId,self.pCode)
  58. checkURL = "http://zxgk.court.gov.cn" + checkURL
  59. print("验证码验证的URL ===>",checkURL)
  60. response = self.session.get(url=checkURL,headers=self.headers)
  61. self.updateCookie()
  62. if response.text.split("\n")[0] == "1":
  63. print("验证码验证通过")
  64. return True
  65. else:
  66. print("验证码验证失败,退出程序")
  67. return False
  68. def check_yzm(self):
  69. while True:
  70. self.init_captcha()
  71. ret = self.check_Captcha()
  72. self.updateCookie()
  73. if ret == True:
  74. break
  75. else:
  76. print("验证失败,正在进行重新请求验证码并验证")
  77. print("验证码的验证通过")
  78. def Get_List(self, name, currentPage):
  79. List_data = {
  80. "pName": name,
  81. "pCardNum": "",
  82. "selectCourtId": "0",
  83. "pCode": self.pCode,
  84. "captchaId": self.captchaId,
  85. "searchCourtName": "全国法院(包含地方各级法院)",
  86. "selectCourtArrange": "1",
  87. "currentPage": currentPage
  88. }
  89. url = self.ctx200.call("get_list", "POST", "searchXgl.do")
  90. print("url ===>",url)
  91. url = "http://zxgk.court.gov.cn" + url
  92. ListResponse = self.session.post(url=url,headers=self.headers,data=List_data)
  93. print("列表页的响应 ===>",ListResponse.text)
  94. self.updateCookie()
  95. try:
  96. self.totalPage = re.findall(r'"totalPage":(.*?),"',ListResponse.text)[0]
  97. print("totalPage ===>",self.totalPage)
  98. self.pdfs.append(re.findall(r'"FILEPATH":"(.*?)","', ListResponse.text))
  99. return True
  100. except:
  101. print("获取totalPage失败了 ListResponse ===> ",ListResponse.text)
  102. self.check_yzm()
  103. return False
  104. def write_link(self):
  105. for pdf in self.pdfs:
  106. for i in pdf:
  107. self.file.write(i)
  108. self.file.write("\n")
  109. self.pdfs = []
  110. def main(self):
  111. self.init_page()
  112. self.check_yzm()
  113. i = 1
  114. while True:
  115. print("i =====>",i)
  116. if i == int(self.totalPage):
  117. break
  118. if self.Get_List("张三",i):
  119. i += 1
  120. self.write_link()
  121. print("关键字为 张三 的内容写入完毕")
  122. self.file.close()
  123. if __name__ == '__main__':
  124. Get_Info().main()