这个项目的 2 个 docker 容器是一个运行 Scrapy 项目的 Python 镜像和一个 Postgres 镜像。
docker-compose.yml
version: '3.8'
services:
app:
container_name: app
build:
context: ./app
dockerfile: dockerfile
environment:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_DB=${POSTGRES_DB}
- POSTGRES_HOST=${POSTGRES_HOST}
- POSTGRES_PORT=${POSTGRES_PORT}
- MAILTO=${MAILTO}
depends_on:
- db
db:
container_name: db
build:
context: ./db
dockerfile: dockerfile
args:
POSTGRES_USER: ${POSTGRES_USER}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
POSTGRES_DB: ${POSTGRES_DB}
ports:
- "${POSTGRES_PORT}:${POSTGRES_PORT}"
admin:
container_name: admin
image: dpage/pgadmin4
environment:
- PGADMIN_DEFAULT_EMAIL=${PGADMIN_DEFAULT_EMAIL}
- PGADMIN_DEFAULT_PASSWORD=${PGADMIN_DEFAULT_PASSWORD}
ports:
- "8888:80"
depends_on:
- db
visualizer:
container_name: visualizer
image: grafana/grafana
ports:
- "3000:3000"
depends_on:
- db
应用程序的 dockerfile
FROM python:3.10-bookworm
RUN apt-get update -q
RUN apt-get install -y cron
COPY . .
RUN pip3 install -r requirements.txt
COPY shell_scripts/scrape_cron /etc/cron.d/scrape_cron
RUN chmod 0744 /etc/cron.d/scrape_cron
RUN crontab /etc/cron.d/scrape_cron
RUN touch /var/log/cron.log
CMD cron && tail -f /var/log/cron.log
数据库的 dockerfile
FROM postgres:15.0
USER postgres
ARG POSTGRES_USER
ARG POSTGRES_PASSWORD
ARG POSTGRES_DB
ENV POSTGRES_USER=$POSTGRES_USER
ENV POSTGRES_PASSWORD=$POSTGRES_PASSWORD
ENV POSTGRES_DB=$POSTGRES_DB
RUN pg_createcluster 15 main && \
/etc/init.d/postgresql start && \
psql --command "CREATE ROLE $POSTGRES_USER WITH SUPERUSER PASSWORD '$POSTGRES_PASSWORD';" && \
createdb -O $POSTGRES_USER $POSTGRES_DB
EXPOSE 5432
CMD ["postgres"]
应用程序容器中的 Scrapy 项目通过标准 psycopg 连接连接到 db 容器中的数据库。
管道.py
hostname = os.environ.get('POSTGRES_HOST', "Hostname not found")
username = os.environ.get('POSTGRES_USER', "Username not found")
password = os.environ.get('POSTGRES_PASSWORD', "Password not found")
database = os.environ.get('POSTGRES_DB', "Database name not found")
port = os.environ.get('POSTGRES_PORT', "Port not found")
logging.debug("Connecting to database...")
try:
self.connection = psycopg.connect(host=hostname, user=username, password=password, dbname=database, port=port)
self.cursor = self.connection.cursor()
logging.info("Connected to database.")
except:
logging.error("Could not connect to database.")
raise
问题出在我为了使项目自动化而实施的 crontab 中。
cron
30 5 * * 0 sh /shell_scripts/scrape.sh
scrape.sh
#!bin/bash
export PATH=$PATH:/usr/local/bin
export POSTGRES_USER=$POSTGRES_USER
export POSTGRES_PASSWORD=$POSTGRES_PASSW
export POSTGRES_DB=$POSTGRES_DB
export POSTGRES_HOST=$POSTGRES_HOST
export POSTGRES_PORT=$POSTGRES_PORT
cd "/scrape"
scrapy crawl spider
我花了一段时间才把玉米搞到这一步。然而,当它激活时,shell脚本执行成功,但我的Scrapy程序无法建立数据库连接,并显示以下消息:
CRITICAL:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/twisted/internet/defer.py", line 1697, in _inlineCallbacks
result = context.run(gen.send, result)
File "/usr/local/lib/python3.10/site-packages/scrapy/crawler.py", line 134, in crawl
self.engine = self._create_engine()
File "/usr/local/lib/python3.10/site-packages/scrapy/crawler.py", line 148, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/local/lib/python3.10/site-packages/scrapy/core/engine.py", line 99, in __init__
self.scraper = Scraper(crawler)
File "/usr/local/lib/python3.10/site-packages/scrapy/core/scraper.py", line 109, in __init__
self.itemproc: ItemPipelineManager = itemproc_cls.from_crawler(crawler)
File "/usr/local/lib/python3.10/site-packages/scrapy/middleware.py", line 67, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/local/lib/python3.10/site-packages/scrapy/middleware.py", line 44, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "/usr/local/lib/python3.10/site-packages/scrapy/utils/misc.py", line 194, in create_instance
instance = objcls(*args, **kwargs)
File "/scrape/scrape/pipelines.py", line 26, in __init__
self.connection = psycopg.connect(host=hostname, user=username, password=password, dbname=database, port=port)
File "/usr/local/lib/python3.10/site-packages/psycopg/connection.py", line 738, in connect
raise ex.with_traceback(None)
psycopg.OperationalError: connection is bad: No such file or directory
Is the server running locally and accepting connections on that socket?
我相信这个问题是由 crontab 使用不同的 shell 来处理作业引起的。 当我通过终端手动运行 shell 脚本时,一切都运行成功。 似乎由于 crontab shell,shell 或程序不再识别连接服务的 Docker 网络,并认为它正在寻找本地的东西。
我不知道如何解决这个问题。我知道在 Docker 中使用 Cron 很棘手,但这有点噩梦。
环境变量的值:
终于解决了问题。由于 cron 在自己的环境中运行,我必须通过以下更改以构建参数的形式将环境变量的值直接传递到 shell 脚本中:
docker-compose.yml
version: '3.8'
services:
app:
container_name: app
build:
context: ./app
dockerfile: dockerfile
args:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_DB=${POSTGRES_DB}
- POSTGRES_HOST=${POSTGRES_HOST}
- POSTGRES_PORT=${POSTGRES_PORT}
environment:
- POSTGRES_USER=${POSTGRES_USER}
- POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
- POSTGRES_DB=${POSTGRES_DB}
- POSTGRES_HOST=${POSTGRES_HOST}
- POSTGRES_PORT=${POSTGRES_PORT}
depends_on:
- db
links:
- db
应用程序的 dockerfile
FROM python:3.10-bookworm
RUN apt-get update -q
RUN apt-get install -y cron
COPY . .
ARG POSTGRES_USER
ARG POSTGRES_PASSWORD
ARG POSTGRES_DB
ARG POSTGRES_HOST
ARG POSTGRES_PORT
RUN sed -i "s/\$POSTGRES_USER/${POSTGRES_USER}/g" shell_scripts/scrape.sh
RUN sed -i "s/\$POSTGRES_PASSWORD/${POSTGRES_PASSWORD}/g" shell_scripts/scrape.sh
RUN sed -i "s/\$POSTGRES_DB/${POSTGRES_DB}/g" shell_scripts/scrape.sh
RUN sed -i "s/\$POSTGRES_HOST/${POSTGRES_HOST}/g" shell_scripts/scrape.sh
RUN sed -i "s/\$POSTGRES_PORT/${POSTGRES_PORT}/g" shell_scripts/scrape.sh
RUN pip3 install -r requirements.txt
COPY shell_scripts/scrape_cron /etc/cron.d/scrape_cron
RUN chmod 0744 /etc/cron.d/scrape_cron
RUN crontab /etc/cron.d/scrape_cron
RUN touch /var/log/cron.log
CMD cron && tail -f /var/log/cron.log