我有一堆数据(100 列和大约 30M 行),我想从 S3 复制。复制速度非常慢 -3-4 小时。我在目标表上定义了一个 dist 键和排序键。 我想知道我应该研究什么来加快复制速度? 我使用的是带有 1 个节点的 ra3.xplus。
这是架构:
CREATE TABLE curation.customer_transaction_v2 (
uuid varchar(100) NOT NULL,
custid varchar(50) NOT NULL,
is_cancelled bool NULL DEFAULT false,
transaction_amount_usd numeric(17, 2) NULL,
transaction_amount numeric(17, 2) NULL,
transaction_date date NULL,
system_of_record text NULL,
is_reseller bool NULL DEFAULT false,
is_in_production bool NULL DEFAULT false,
created_on timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_on timestamptz NULL DEFAULT CURRENT_TIMESTAMP,
created_by text NULL DEFAULT CURRENT_USER,
updated_by text NULL DEFAULT CURRENT_USER,
supplier_uuid varchar(100) NULL,
oem_uuid varchar(100) NULL,
vendor_uuid varchar(100) NULL,
unique_id text NULL,
cxo_legacy_unique_id text NULL,
cxo_status_code text NULL DEFAULT 'new'::text,
batch_part_id int8 NULL,
batchid int4 NULL,
transaction_description varchar(1000) NULL,
deployed_to_incight timestamptz NULL,
deployed_to_borg timestamptz NULL,
transaction_type text NOT NULL,
business_unit text NULL,
source_entity text NULL,
line_item_amount_usd numeric(17, 2) NULL,
custid_fully_qualified text NULL,
additional_description varchar(1000) NULL,
approval_status text NULL,
approver text NULL,
cost_center_code varchar(1000) NULL,
cost_center_name varchar(1000)NULL,
cost_center_parent_code varchar(1000) NULL,
currency_code text NULL,
conversion_rate numeric NULL,
cxo_product_description varchar(1000) NULL,
cxo_product_id int8 NULL,
department_code text NULL,
department_name varchar(1000) NULL,
discount_amount numeric NULL,
expense_type text NULL,
gl_code varchar(1000) NULL,
gl_name text NULL,
gl_posting_date date NULL,
invoice_id text NULL,
invoice_line_item_id text NULL,
invoiced_line_item_amount numeric NULL,
line_item_amount numeric NULL,
line_item_description varchar(65535) NULL,
line_item_discount_amount numeric NULL,
line_item_id text NULL,
line_item_product_part_number text NULL,
line_item_quantity numeric NULL,
line_item_tax_amount numeric NULL,
line_item_unit_of_measure text NULL,
line_item_unit_price numeric NULL,
po_id text NULL,
po_line_item_id text NULL,
requisition_line_item_id text NULL,
requisition_id text NULL,
shipping_amount numeric NULL,
submitter text NULL,
tax_amount numeric NULL,
total_amount numeric NULL,
vendor_identifier varchar(65535) NULL,
vendor_name varchar(65535) NULL,
document_date date NULL,
document_id varchar(65535) NULL,
document_description varchar(65535) NULL,
transaction_vendor_name varchar(65535) NULL,
paid_date date NULL,
manufacturer_name varchar(65535) NULL,
id int8 NOT NULL,
cxo_product_uuid varchar(100) NULL,
closest_supplier_uuid varchar(100) NULL,
closest_oem_uuid varchar(100) NULL,
import_uuid varchar(100) NULL,
product_uuid varchar(100) NULL,
curation_stale bool NULL DEFAULT true,
finalized bool NOT NULL DEFAULT false,
invoice_due_date date NULL,
training_text varchar(1000) NULL,
custom_fields super NULL,
cxo_duplicate_check_field varchar(1000) NULL,
cxo_notes varchar(1000) NULL,
force_recalc bool NULL DEFAULT false,
is_date_null bool NULL,
is_date_bad_range bool NULL,
is_date_in_error bool NULL,
import_file_uuid varchar(100) NULL,
transaction_vendor_identifier varchar(65535) NULL,
training_text_hash text NULL,
cancel_reason text NULL,
generated_description varchar(1000) NULL,
generated_vendor_name varchar(1000) NULL,
generated_vendor_short_name varchar(1000) NULL,
closest_supplier_confidence_level numeric NULL,
closest_oem_confidence_level numeric NULL,
partner_reseller_name varchar(1000) NULL,
partner_reseller_uuid varchar(100) NULL,
end_customer_name varchar(1000) NULL,
end_customer_uuid varchar(100) NULL,
unspsc_product_code varchar(1000) NULL,
line_item_product_sku varchar(1000) NULL,
generated_end_customer_name varchar(1000) NULL,
generated_partner_reseller_name varchar(1000) NULL,
closest_end_customer_confidence_level numeric NULL,
closest_partner_reseller_confidence_level numeric NULL,
closest_end_customer_uuid varchar(100) NULL,
closest_partner_reseller_uuid varchar(100) NULL,
distributor_name varchar(1000) NULL,
generated_description_hash varchar(1000) NULL,
cxo_category_override bool NULL DEFAULT false,
client_category_override bool NULL DEFAULT false,
client_oem_override bool NULL DEFAULT false,
client_oem_override_reason varchar(1000) NULL,
cxo_oem_override bool NULL DEFAULT false,
cxo_oem_override_reason varchar(1000) NULL,
client_category_override_reason varchar(1000) NULL,
cxo_category_override_reason varchar(1000) NULL,
pickle_id text NULL,
pii_individual_line_item_description_flag varchar NULL,
line_item_description_deidentified varchar NULL,
pii_individual_supplier_short_name_flag varchar NULL,
pii_individual_oem_short_name_flag varchar NULL,
marketplace_uuid varchar(100) NULL,
channel_partner_uuid varchar(100) null
)
DISTSTYLE KEY
DISTKEY (custid)
SORTKEY (
custid
, document_date
, paid_date
, gl_posting_date
)
I use the following copy:
COPY curation.customer_transaction_v2
FROM 's3://<bucket>customer_transaction_2'
CREDENTIALS '<>'
CSV DELIMITER ','
我使用“batch_id”列分割文件进行分布式副本
需要考虑的一些事情: