使用 AWS Athena 查询数据的正确方法

问题描述 投票:0回答:1

我用 Athena 查询 Common Crawl(https://commoncrawl.org/overview)的数据时超时了。即使查询成功,按每 TB 5 美元、约 200 TB(?)的扫描量计算,每次查询也要花费我约 1000 美元,实在太贵了。

这就是我所做的:

-- Create the database used for the raw crawl-data table.
-- IF NOT EXISTS keeps the statement safe to re-run.
CREATE DATABASE IF NOT EXISTS CommonData20102024;



-- External table over the CC-MAIN-2024-38 crawl-data prefix of the commoncrawl
-- bucket, read through RegexSerDe. Column names follow WARC header field names,
-- with a final Payload column.
-- NOTE(review): the "input.regex" value is the placeholder "..." here;
-- presumably the real pattern has to be filled in before use — confirm.
CREATE EXTERNAL TABLE IF NOT EXISTS CommonData20102024.commoncrawl_warc (
    WARC_Type STRING,
    WARC_Date STRING,
    WARC_Record_ID STRING,
    Content_Length INT,
    WARC_Concurrent_To STRING,
    Content_Type STRING,
    WARC_Block_Digest STRING,
    WARC_Payload_Digest STRING,
    WARC_IP_Address STRING,
    WARC_Refers_To STRING,
    WARC_Target_URI STRING,
    WARC_Truncated STRING,
    WARC_Warcinfo_ID STRING,
    WARC_Filename STRING,
    WARC_Profile STRING,
    WARC_Identified_Payload_Type STRING,
    Payload STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES ("input.regex" = "...")
LOCATION 's3://commoncrawl/crawl-data/CC-MAIN-2024-38/';


-- Return the target URI of every record whose lower-cased URI contains ".de".
-- The table is qualified with CommonData20102024, the database it is created
-- in, instead of the non-matching ccrawl_db.
-- NOTE(review): '%.de%' matches ".de" anywhere in the URI (path and query
-- string included), not only in the top-level domain.
SELECT WARC_Target_URI
FROM CommonData20102024.commoncrawl_warc
WHERE lower(WARC_Target_URI) LIKE '%.de%';

我的问题:这是访问该数据的正确方法吗?我只想获取使用德国顶级域名(.de)的网址。

amazon-web-services web-crawler amazon-athena tld common-crawl
1个回答
0
投票

下面这篇教程帮我找到了答案:

https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format

我使用了以下 SQL 语句:

-- Create the ccindex database. IF NOT EXISTS keeps the statement re-runnable,
-- and the terminator separates it from the statement that follows.
CREATE DATABASE IF NOT EXISTS ccindex;

-- External Parquet table over the cc-index prefix of the commoncrawl bucket.
-- Columns cover the URL and its host-name parts, fetch metadata, content
-- metadata, and the WARC file name / record offset / record length / segment.
-- The table is partitioned by crawl and subset.
CREATE EXTERNAL TABLE IF NOT EXISTS ccindex (
    url_surtkey STRING,
    url STRING,
    url_host_name STRING,
    url_host_tld STRING,
    url_host_2nd_last_part STRING,
    url_host_3rd_last_part STRING,
    url_host_4th_last_part STRING,
    url_host_5th_last_part STRING,
    url_host_registry_suffix STRING,
    url_host_registered_domain STRING,
    url_host_private_suffix STRING,
    url_host_private_domain STRING,
    url_protocol STRING,
    url_port INT,
    url_path STRING,
    url_query STRING,
    fetch_time TIMESTAMP,
    fetch_status SMALLINT,
    content_digest STRING,
    content_mime_type STRING,
    content_mime_detected STRING,
    content_charset STRING,
    content_languages STRING,
    warc_filename STRING,
    warc_record_offset INT,
    warc_record_length INT,
    warc_segment STRING
)
PARTITIONED BY (
    crawl STRING,
    subset STRING
)
STORED AS PARQUET
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';

-- Register the partitions found under the table location in the catalog, so
-- that queries filtering on crawl / subset can find data. The terminator
-- separates this statement from the query that follows.
MSCK REPAIR TABLE ccindex;

-- URLs whose host TLD is 'de', taken from the 'warc' subset of the
-- CC-MAIN-2024-38 crawl. crawl and subset are the table's partition columns.
SELECT url
FROM ccindex
WHERE url_host_tld = 'de'
    AND crawl = 'CC-MAIN-2024-38'
    AND subset = 'warc';
© www.soinside.com 2019 - 2024. All rights reserved.