Browse Source

feat: 添加dockerfile等镜像配置

许家凯 1 year ago
parent
commit
31aa86fe91

+ 20 - 0
.drone.yml

@@ -0,0 +1,20 @@
+kind: pipeline
+type: kubernetes
+name: deploy
+trigger:
+  ref:
+    - refs/heads/master
+steps:
+  - name: build
+    image: registry-vpc.cn-shanghai.aliyuncs.com/winhc-spider/drone-kaniko
+    settings:
+      registry_mirrors: 9s9ylon1.mirror.aliyuncs.com
+      registry: registry-vpc.cn-shanghai.aliyuncs.com
+      skip_tls_verify: true
+      enable_cache: true
+      cache_repo: 192.168.1.7:5000/winhc-spider/data-clean
+      username:
+        from_secret: username
+      password:
+        from_secret: password
+      repo: registry-vpc.cn-shanghai.aliyuncs.com/winhc-spider/data-clean

+ 2 - 0
.gitignore

@@ -1,3 +1,5 @@
+tests
+
 ### Python template
 # Byte-compiled / optimized / DLL files
 __pycache__/

+ 22 - 0
Dockerfile

@@ -0,0 +1,22 @@
+FROM sunpeek/poetry:py3.11-slim as requirements
+WORKDIR /src
+COPY ./pyproject.toml ./
+COPY ./poetry.lock ./
+RUN poetry export -f requirements.txt --without-hashes -o /src/requirements.txt
+
+FROM python:3.11 as builder
+
+# ... yourself commands
+
+COPY --from=requirements /src/requirements.txt .
+RUN pip install --user -r requirements.txt -i http://mirrors.cloud.aliyuncs.com/pypi/simple/ --trusted-host mirrors.cloud.aliyuncs.com
+
+FROM python:3.11-slim-bullseye
+COPY --from=builder /root/.local /root/.local
+COPY . /app/data-clean
+# update PATH environment variable
+ENV PATH=/root/.local/bin:/root/.local:$PATH
+WORKDIR /app
+CMD ["python","-m","JobMain"]
+
+# ... yourself commands

+ 58 - 9
JobMain.py

@@ -6,29 +6,77 @@
 import asyncio
 import json
 
+import aio_pika
+from aio_pika import IncomingMessage
 from environs import Env
 
 from data_clean.task_distributor import task_distribute
-from data_clean.utils.async_client import get_aio_kafka_consumer, get_aio_kafka_producer
+from data_clean.utils.async_client import get_aio_kafka_consumer, get_aio_kafka_producer, get_rabbitmq_connection
 from data_clean.utils.asyncio_pool import AsyncPool
 
-source_topic = "source_topic"
-target_topic = "target_topic"
-
 env = Env()
 env = Env()
+
+base_topic = env.str("base_topic", "rt_other_dim")
+
+source_topic = env.str("source_topic", base_topic)  # "rt_company_dim"
+target_topic = env.str("target_topic", base_topic)  # "rt_company_dim"
+
 max_concurrency = env.int("concurrency", 20)
 max_concurrency = env.int("concurrency", 20)
 
 
 
 
-async def handle(producer, data):
+async def handle(producer, data: dict):
     result = await task_distribute(data)
     result = await task_distribute(data)
     print("send : ", result)
     print("send : ", result)
+
     if result is not None:
     if result is not None:
         await producer.send_and_wait(target_topic, json.dumps(result).encode())
         await producer.send_and_wait(target_topic, json.dumps(result).encode())
         pass
         pass
     pass
     pass
 
 
 
 
-async def main():
+async def on_message_received(producer, msg: IncomingMessage):
+    data: dict = json.loads(msg.body)
+    await handle(producer, data)
+    await msg.ack()
+    pass
+
+
+async def main_for_rabbitmq():
+    print("start job. Listening queue :", source_topic, "send topic:", target_topic, "max concurrency:",
+          max_concurrency)
+
+    pool = AsyncPool(max_concurrency)
+
+    producer = get_aio_kafka_producer()
+    await producer.start()
+
+    queue_name = source_topic  # 只需要配置这个
+
+    connection = await get_rabbitmq_connection()
+    async with connection:
+        channel: aio_pika.abc.AbstractChannel = await connection.channel()
+        await channel.set_qos(prefetch_count=max_concurrency)
+
+        # Declaring queue
+        queue: aio_pika.abc.AbstractQueue = await channel.declare_queue(
+            name=queue_name,
+            durable=True,
+            auto_delete=False
+        )
+
+        async with queue.iterator(no_ack=False) as queue_iter:
+            # Cancel consuming after __aexit__
+            async for message in queue_iter:
+                message: IncomingMessage = message
+                await pool.create_task(on_message_received(producer, message))
+                # async with message.process(ignore_processed=True):
+                #     await pool.create_task(on_message_received(producer, message))
+                #     pass
+        pass
+    pass
+
+
+async def main_for_kafka():
     pool = AsyncPool(max_concurrency)
     pool = AsyncPool(max_concurrency)
     consumer = get_aio_kafka_consumer(source_topic)
     consumer = get_aio_kafka_consumer(source_topic)
     producer = get_aio_kafka_producer()
     producer = get_aio_kafka_producer()
@@ -37,10 +85,11 @@ async def main():
     try:
     try:
         # Consume messages
         # Consume messages
         async for msg in consumer:
         async for msg in consumer:
-            # print("consumed: ", msg.topic, msg.partition, msg.offset,
-            #       msg.key, msg.value, msg.timestamp)
+            print("consumed: ", msg.topic, msg.partition, msg.offset,
+                  msg.key, msg.value, msg.timestamp)
             data: dict = json.loads(msg.value)
             data: dict = json.loads(msg.value)
             await pool.create_task(handle(producer, data))
             await pool.create_task(handle(producer, data))
+
     finally:
     finally:
         # Will leave consumer group; perform autocommit if enabled.
         # Will leave consumer group; perform autocommit if enabled.
         await consumer.stop()
         await consumer.stop()
@@ -49,5 +98,5 @@ async def main():
 
 
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
-    asyncio.run(main())
+    asyncio.run(main_for_rabbitmq())
     pass
     pass

+ 5 - 0
data_clean/env/env-dev.yaml

@@ -72,3 +72,8 @@ holo:
     username: LTAI4G4yiyJV4ggnLyGMduqV
     username: LTAI4G4yiyJV4ggnLyGMduqV
     pwd: nokDg5HlVIBh80nL2dOXsKa2La4XL5
     pwd: nokDg5HlVIBh80nL2dOXsKa2La4XL5
 
 
+rabbit_mq:
+  base:
+    host: 106.15.78.184
+    username: whc
+    password: whc

+ 5 - 0
data_clean/env/env-prod.yaml

@@ -45,3 +45,8 @@ holo:
     username: LTAI4G4yiyJV4ggnLyGMduqV
     username: LTAI4G4yiyJV4ggnLyGMduqV
     pwd: nokDg5HlVIBh80nL2dOXsKa2La4XL5
     pwd: nokDg5HlVIBh80nL2dOXsKa2La4XL5
 
 
+rabbit_mq:
+  base:
+    host: 192.168.2.63
+    username: whc
+    password: whc

+ 4 - 0
data_clean/exception/exception_handle.py

@@ -9,6 +9,7 @@ from data_clean.api.mongo_api import insert_one
 from data_clean.exception.fetch_exception import FetchException
 from data_clean.exception.fetch_exception import FetchException
 from data_clean.exception.ruler_validation_exception import RulerValidationException
 from data_clean.exception.ruler_validation_exception import RulerValidationException
 from data_clean.utils import get_log
 from data_clean.utils import get_log
+from data_clean.utils.date_utils import get_now_datetime
 
 
 log = get_log("exception_handler")
 log = get_log("exception_handler")
 
 
@@ -26,6 +27,7 @@ async def ruler_valid_exception_sink(ex: RulerValidationException, tn: str, data
         "ruler_code": ex.ruler_code,
         "ruler_code": ex.ruler_code,
         "tn": tn,
         "tn": tn,
         "exception": str(ex),
         "exception": str(ex),
+        "create_time": get_now_datetime(),
         "data": data
         "data": data
     }
     }
     await insert_one(col, doc)
     await insert_one(col, doc)
@@ -45,6 +47,7 @@ async def fetch_exception_sink(ex: FetchException, tn: str, data: list):
     doc = {
     doc = {
         "tn": tn,
         "tn": tn,
         "data": data,
         "data": data,
+        "create_time": get_now_datetime(),
         "exception": str(ex),
         "exception": str(ex),
     }
     }
     await insert_one(col_pre, doc)
     await insert_one(col_pre, doc)
@@ -62,6 +65,7 @@ async def error_sink(ex: Exception, tn: str, data: list):
     col_pre = f"a_data_clean_error"
     col_pre = f"a_data_clean_error"
     doc = {
     doc = {
         "tn": tn,
         "tn": tn,
+        "create_time": get_now_datetime(),
         "exception": repr(ex),
         "exception": repr(ex),
         "data": data
         "data": data
     }
     }

+ 7 - 2
data_clean/handle/company_court_open_announcement.py

@@ -15,7 +15,7 @@ from data_clean.utils.str_utils import json_str_2_list
 dim_handle = get_dim_handle(os.path.basename(__file__))
 dim_handle = get_dim_handle(os.path.basename(__file__))
 
 
 
 
-@dim_handle.registry_prefix_func
+# @dim_handle.registry_prefix_func
 async def prefix_func(dim_data: list):
 async def prefix_func(dim_data: list):
     print("前置程序:", dim_data)
     print("前置程序:", dim_data)
     # raise ValueError("前置程序错误")
     # raise ValueError("前置程序错误")
@@ -25,7 +25,12 @@ async def prefix_func(dim_data: list):
 
 
 @dim_handle.registry_postfix_func()
 @dim_handle.registry_postfix_func()
 async def post_func(dim_data: list):
 async def post_func(dim_data: list):
-    print("后置程序")
+    print("后置程序:", dim_data)
+
+    for r in dim_data:
+        r['update_time'] = get_update_time()
+        pass
+
     pass
     pass
 
 
 
 

+ 3 - 1
data_clean/task_distributor.py

@@ -35,7 +35,9 @@ async def task_distribute(data: dict):
                 tmp_data[key] = result_data
                 tmp_data[key] = result_data
 
 
         else:
         else:
-            raise ValueError(f"{key} 维度未实现!")
+            # raise ValueError(f"{key} 维度未实现!")
+            print(f'{key}维度未实现!直接发送...')
+            pass
 
 
     if len(tmp_data) == 0:
     if len(tmp_data) == 0:
         return None
         return None

+ 21 - 0
data_clean/utils/async_client.py

@@ -3,6 +3,10 @@
 # @Author : XuJiakai
 # @Author : XuJiakai
 # @File : async_client
 # @File : async_client
 # @Software: PyCharm
 # @Software: PyCharm
+import asyncio
+
+import aio_pika
+from aio_pika.abc import AbstractRobustConnection
 from aiokafka import AIOKafkaConsumer
 from aiokafka import AIOKafkaConsumer
 from aiokafka import AIOKafkaProducer
 from aiokafka import AIOKafkaProducer
 from motor.motor_asyncio import AsyncIOMotorClient
 from motor.motor_asyncio import AsyncIOMotorClient
@@ -36,5 +40,22 @@ def get_aio_kafka_producer(name='base'):
     pass
     pass
 
 
 
 
+async def get_rabbitmq_connection(name: str = "base") -> AbstractRobustConnection:
+    host = _env.get_val('rabbit_mq.' + name + '.host')
+    username = _env.get_val('rabbit_mq.' + name + '.username')
+    password = _env.get_val('rabbit_mq.' + name + '.password')
+    port = 32675
+    virtual_host = '/'
+
+    loop = asyncio.get_event_loop()
+    return await aio_pika.connect_robust(
+        host=host, password=password, virtualhost=virtual_host
+        , port=port
+        , login=username
+        , loop=loop
+    )
+    pass
+
+
 if __name__ == '__main__':
 if __name__ == '__main__':
     pass
     pass

+ 100 - 0
data_clean/utils/asyncio_pool.py

@@ -4,6 +4,9 @@
 # @File : async_pool
 # @File : async_pool
 # @Software: PyCharm
 # @Software: PyCharm
 import asyncio
 import asyncio
+import functools
+import signal
+import sys
 from typing import Coroutine
 from typing import Coroutine
 
 
 
 
@@ -18,5 +21,102 @@ class AsyncPool(object):
         return task
         return task
 
 
 
 
+class GracefulExit(SystemExit):
+    code = 1
+
+
+class AsyncPoolListenShut:
+    def __init__(self, max_concurrency: int):
+        self._semaphore: asyncio.Semaphore = asyncio.Semaphore(max_concurrency)
+        self._data: dict = {}
+        self._look = False
+        self._is_windows = sys.platform == 'win32'
+        self._register_shutdown_by_signal()
+        self._current_data = None
+
+    def _register_shutdown_by_signal(self):
+        print("注册shutdown")
+        signal.signal(signal.SIGINT, functools.partial(self.listen_shutdown))
+        signal.signal(signal.SIGTERM, functools.partial(self.listen_shutdown))
+        pass
+
+    async def create_task(self, coro: Coroutine, data) -> asyncio.Task:
+        self._current_data = data
+        if self._look:
+            print("停止消费...")
+            await asyncio.sleep(10000)
+            pass
+        await self._semaphore.acquire()
+        task: asyncio.Task = asyncio.create_task(coro)
+        print('创建task,id: ', id(task))
+        self._data[task] = data
+        task.add_done_callback(lambda t: self._release(t))
+
+        return task
+        pass
+
+    def _release(self, t):
+        print("释放task,id:", id(t))
+        del self._data[t]
+        self._semaphore.release()
+
+    async def _shutdown(self):
+        print("检测到停止信号...")
+        # await self._look.acquire()
+        count = len(self._data)
+        print('当前数量%s' % count)
+        num = 0
+        while count > 0:
+            num += 1
+
+            print("\n第%s" % num)
+            for t in self._data.keys():
+                print(id(t), "是否结束", t.done())
+                count -= 1 if t.done() else 0
+            await asyncio.sleep(3)
+
+        await asyncio.sleep(3)
+        print("所有任务已经结束!")
+
+    def listen_shutdown(self, *args, **kwargs):
+        all_data = list(self._data.values()) + [self._current_data]
+        print("all_data: ", all_data)
+        self._look = True
+        print("所有任务已经结束!")
+        loop = asyncio.get_running_loop()
+        tasks = asyncio.tasks.all_tasks(loop)
+        for t in tasks:
+            t.cancel()
+
+        loop.stop()
+        # raise GracefulExit()
+        pass
+
+    pass
+
+
+async def run1(tt=None):
+    print("sleeping")
+    await asyncio.sleep(3)
+    print("slept")
+    if tt:
+        print(tt)
+    pass
+
+
+async def callback(msg):
+    print("callback")
+    await asyncio.sleep(3)
+    pass
+
+
+async def main():
+    pool = AsyncPoolListenShut(5)
+    for i in range(10):
+        await pool.create_task(run1(), str(i))
+    pass
+
+
 if __name__ == '__main__':
 if __name__ == '__main__':
+    asyncio.run(main())
     pass
     pass

+ 12 - 1
data_clean/utils/date_utils.py

@@ -6,13 +6,24 @@
 
 
 from datetime import datetime
 from datetime import datetime
 
 
+establish_state_time = datetime(year=1949, month=10, day=1)
+
+
+def get_update_time():
+    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    pass
+
 
 
 def str_2_date_time(date_str, format="%Y-%m-%d %H:%M:%S"):
 def str_2_date_time(date_str, format="%Y-%m-%d %H:%M:%S"):
     return datetime.strptime(date_str, format)
     return datetime.strptime(date_str, format)
     pass
     pass
 
 
 
 
-establish_state_time = datetime(year=1949, month=10, day=1)
+def get_now_datetime():
+    return datetime.now()
+    pass
+
 
 
 if __name__ == '__main__':
 if __name__ == '__main__':
+    print(get_update_time())
     pass
     pass

+ 51 - 1
poetry.lock

@@ -1,6 +1,21 @@
 # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 # This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
 
 
 [[package]]
 [[package]]
+name = "aio-pika"
+version = "9.1.5"
+description = "Wrapper around the aiormq for asyncio and humans"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "aio_pika-9.1.5-py3-none-any.whl", hash = "sha256:11c3ef2294dd340224c0eafa56c36424b21094a920f19b736080a0f4f4dad863"},
+    {file = "aio_pika-9.1.5.tar.gz", hash = "sha256:285bb9f8d156834c82fafe6375b6957d0ea9e9d823c12b844406ee71de9d7a9f"},
+]
+
+[package.dependencies]
+aiormq = ">=6.7.5,<6.8.0"
+yarl = "*"
+
+[[package]]
 name = "aiohttp"
 name = "aiohttp"
 version = "3.8.5"
 version = "3.8.5"
 description = "Async http client/server framework (asyncio)"
 description = "Async http client/server framework (asyncio)"
@@ -155,6 +170,21 @@ snappy = ["python-snappy (>=0.5)"]
 zstd = ["zstandard"]
 zstd = ["zstandard"]
 
 
 [[package]]
 [[package]]
+name = "aiormq"
+version = "6.7.7"
+description = "Pure python AMQP asynchronous client library"
+optional = false
+python-versions = ">=3.7,<4.0"
+files = [
+    {file = "aiormq-6.7.7-py3-none-any.whl", hash = "sha256:f5efbfcd7d703f3c05c08d4e74cfaa66ca7199840e2969d75ad41b0810026b0a"},
+    {file = "aiormq-6.7.7.tar.gz", hash = "sha256:3b93f612f56989b2757a9a7b299dd94dd3227ce28ba43e81d5fbcded6341dfab"},
+]
+
+[package.dependencies]
+pamqp = "3.2.1"
+yarl = "*"
+
+[[package]]
 name = "aiosignal"
 name = "aiosignal"
 version = "1.3.1"
 version = "1.3.1"
 description = "aiosignal: a list of registered asynchronous callbacks"
 description = "aiosignal: a list of registered asynchronous callbacks"
@@ -686,6 +716,26 @@ url = "https://pypi.doubanio.com/simple"
 reference = "douban"
 reference = "douban"
 
 
 [[package]]
 [[package]]
+name = "pamqp"
+version = "3.2.1"
+description = "RabbitMQ Focused AMQP low-level library"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "pamqp-3.2.1-py2.py3-none-any.whl", hash = "sha256:15acef752356593ca569d13dfedc8ada9f17deeeb8cec4f7b77825e2b6c7de3e"},
+    {file = "pamqp-3.2.1.tar.gz", hash = "sha256:22550ceb1ca50aafda65873e77e8c1c1b139fb5975e1a09860fae940cf8e970a"},
+]
+
+[package.extras]
+codegen = ["lxml", "requests", "yapf"]
+testing = ["coverage", "flake8", "flake8-comprehensions", "flake8-deprecated", "flake8-import-order", "flake8-print", "flake8-quotes", "flake8-rst-docstrings", "flake8-tuple", "yapf"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.doubanio.com/simple"
+reference = "douban"
+
+[[package]]
 name = "pymongo"
 name = "pymongo"
 version = "4.4.1"
 version = "4.4.1"
 description = "Python driver for MongoDB <http://www.mongodb.org>"
 description = "Python driver for MongoDB <http://www.mongodb.org>"
@@ -958,4 +1008,4 @@ reference = "douban"
 [metadata]
 [metadata]
 lock-version = "2.0"
 lock-version = "2.0"
 python-versions = "^3.8"
 python-versions = "^3.8"
-content-hash = "444555b24d16dff3475f6bf40c4687aadefe20484f489cecb6443381966f6ef3"
+content-hash = "c9162f03076888e29683033e980e3b021a59c8684990b7afa00334b2c3beab84"

+ 1 - 0
pyproject.toml

@@ -13,6 +13,7 @@ aiohttp = "^3.8.5"
 motor = "^3.2.0"
 motor = "^3.2.0"
 pyyaml = "^6.0.1"
 pyyaml = "^6.0.1"
 environs = "^9.5.0"
 environs = "^9.5.0"
+aio-pika = "^9.1.5"
 
 
 
 
 [build-system]
 [build-system]