我用 Athena 查询 Common Crawl(https://commoncrawl.org/overview)的数据时超时了。即使查询成功,按 Athena 每 TB 扫描量 5 美元的计费方式,一次扫描约 200 TB 的完整抓取数据也要花费我约 1000 美元——实在太贵了。
这就是我所做的:
-- NOTE(review): this table points at the raw WARC files of an entire crawl
-- (s3://commoncrawl/crawl-data/...), so every query scans the full archive —
-- at Athena's $5/TB pricing that is exactly where the huge per-query cost
-- comes from. For URL-only questions, query the columnar index instead.
CREATE DATABASE CommonData20102024;

CREATE EXTERNAL TABLE IF NOT EXISTS CommonData20102024.commoncrawl_warc (
    WARC_Type STRING,
    WARC_Date STRING,
    WARC_Record_ID STRING,
    Content_Length INT,
    WARC_Concurrent_To STRING,
    Content_Type STRING,
    WARC_Block_Digest STRING,
    WARC_Payload_Digest STRING,
    WARC_IP_Address STRING,
    WARC_Refers_To STRING,
    WARC_Target_URI STRING,
    WARC_Truncated STRING,
    WARC_Warcinfo_ID STRING,
    WARC_Filename STRING,
    WARC_Profile STRING,
    WARC_Identified_Payload_Type STRING,
    Payload STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    "input.regex" = "..."
)
-- NOTE(review): RegexSerDe parses line-oriented text; the objects under
-- crawl-data/ are gzip-compressed multi-record WARC archives — confirm this
-- SerDe can read them at all before relying on any results.
LOCATION 's3://commoncrawl/crawl-data/CC-MAIN-2024-38/';

-- Bug fix: the table lives in CommonData20102024; the original SELECT
-- referenced ccrawl_db, a database that was never created.
SELECT WARC_Target_URI
FROM CommonData20102024.commoncrawl_warc
-- Bug fix: LIKE '%.de%' matches ".de" anywhere in the URL (path, query
-- string, hosts like "example.design"). Extract the host with Athena's
-- url_extract_host() and anchor the match at the end of the host name.
WHERE url_extract_host(WARC_Target_URI) LIKE '%.de';
我的问题:这是访问该数据的正确方法吗?我只想获取带有德语 TLD(.de)的网址。
按照下面这篇教程操作,我找到了答案:
https://commoncrawl.org/blog/index-to-warc-files-and-urls-in-columnar-format
我使用了以下 SQL 语句:
-- Columnar URL index (Parquet): Athena scans only the columns and partitions
-- the query needs, instead of the raw WARC archives — this is what keeps the
-- billed scan volume small.
CREATE DATABASE ccindex;

-- Qualify the table with its database so the DDL does not silently create it
-- in whatever database the session currently has selected.
CREATE EXTERNAL TABLE IF NOT EXISTS ccindex.ccindex (
    url_surtkey STRING,
    url STRING,
    url_host_name STRING,
    url_host_tld STRING,
    url_host_2nd_last_part STRING,
    url_host_3rd_last_part STRING,
    url_host_4th_last_part STRING,
    url_host_5th_last_part STRING,
    url_host_registry_suffix STRING,
    url_host_registered_domain STRING,
    url_host_private_suffix STRING,
    url_host_private_domain STRING,
    url_protocol STRING,
    url_port INT,
    url_path STRING,
    url_query STRING,
    fetch_time TIMESTAMP,
    fetch_status SMALLINT,
    content_digest STRING,
    content_mime_type STRING,
    content_mime_detected STRING,
    content_charset STRING,
    content_languages STRING,
    warc_filename STRING,
    warc_record_offset INT,
    warc_record_length INT,
    warc_segment STRING)
PARTITIONED BY (
    crawl STRING,
    subset STRING)
STORED AS parquet
LOCATION 's3://commoncrawl/cc-index/table/cc-main/warc/';

-- Discover the crawl/subset partitions on S3; run once after creating the
-- table (and again when new crawls are published).
MSCK REPAIR TABLE ccindex.ccindex;

-- Filtering on BOTH partition keys (crawl, subset) prunes every other
-- crawl's data from the scan, keeping cost and runtime low.
SELECT url
FROM ccindex.ccindex
WHERE crawl = 'CC-MAIN-2024-38'
  AND subset = 'warc'
  AND url_host_tld = 'de';