如何将扩展名为 .DUSMCPUB 的文件导入到 R 中?

问题描述 投票:0回答:3

我正在尝试导入美国国家卫生统计中心(NCHS)的多死因死亡率(Mortality Multiple Cause)文件,文件位于此链接:

https://www.cdc.gov/nchs/data_access/vitalstatsonline.htm#Downloadable

链接到在 NCHS 网站上查找文件的图像

这些文件的扩展名是.DUSMCPUB(例如,2020 年的文件名为“VS20MORT.DUSMCPUB_r20220105”)。如何导入这样的文件?我不熟悉该扩展名。

我尝试使用以下代码导入,但它导致我的 R 程序终止。您能为我提供有关如何导入这些类型的文件的建议吗?

# Original attempt from the question (reported to make the R session terminate).
# NOTE(review): the answers on this page describe the file as fixed-width, so a
# delimited-text reader is presumably the wrong tool here.
VS20MORT <- read_delim("VS20MORT.DUSMCPUB_r20220105")
r import cdc fixed-width
3个回答
2
投票

感谢 @Mel G 分享这种方法。当我尝试运行它时,我发现自 2020 年起,死亡率文件新增了一些变量(即死者的职业和行业)。下面是在原方法基础上加入这些新变量的略作修改的版本。

# Install and load necessary packages
# install.packages("sqldf") # Used to read in DUSMCPUB file
# install.packages("dplyr") # Used for tidy data management
library(sqldf)
library(dplyr)

# Increase the memory limit (in MB) to make space for the large file.
# memory.limit() only ever had an effect on Windows, and since R 4.2.0 it is a
# stub that merely emits a warning, so it is called only where it still works.
# Run memory.limit() with no arguments to inspect the current limit.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 20000)
}

# Fixed-width layout of the 2020 mortality file: one row per field, giving the
# field's width in characters ('widths') and the column name to assign
# ('names'). Fields named "blank*" are filler that is dropped after import.
columns <- local({
  # The 20 entity-axis and 20 record-axis condition fields share one width each.
  entity_axis <- setNames(rep(7, 20), paste0("Condition_", seq_len(20), "EA"))
  record_axis <- setNames(rep(5, 20), paste0("Condition_", seq_len(20), "RA"))

  # Each entry is <column name> = <width>, listed in file order.
  layout <- c(
    blank1 = 19,                        # tape locations 1-19
    Resident_Status_US = 1,             # tape location 20
    blank2 = 40,
    Education_1989 = 2,
    Education_2003 = 1,
    Education_flag = 1,
    Month_of_Death = 2,
    blank3 = 2,
    Sex = 1,
    DetailAge = 4,
    Age_Substitution_Flag = 1,
    Age_Recode_52 = 2,
    Age_Recode_27 = 2,
    Age_Recode_12 = 2,
    Infant_Age_Recode_22 = 2,
    Place_of_Death_and_Status = 1,
    Marital_Status = 1,
    Day_of_Week_of_Death = 1,
    blank4 = 16,
    Current_Data_Year = 4,
    Injury_at_Work = 1,
    Manner_of_Death = 1,
    Method_of_Disposition = 1,
    Autopsy = 1,
    blank5 = 34,
    Activity_Code = 1,
    Place_of_Injury = 1,
    ICD_Code_10 = 4,
    Cause_Recode_358 = 3,
    blank6 = 1,
    Cause_Recode_113 = 3,
    Infant_Cause_Recode_130 = 3,
    Cause_Recode_39 = 2,
    blank7 = 1,
    Number_Entity_Axis_Conditions = 2,
    entity_axis,
    blank8 = 36,
    Number_Record_Axis_Conditions = 2,
    blank9 = 1,
    record_axis,
    blank10 = 1,
    Race = 2,
    Bridged_Race_Flag = 1,
    Race_Imputation_Flag = 1,
    Race_Recode_3 = 1,
    Race_Recode_5 = 1,
    blank11 = 33,
    Hispanic_Origin = 3,
    blank12 = 1,
    Hispanic_Origin_9_Race_Recode = 1,
    Race_Recode_40 = 2,
    blank13 = 315,
    CensusOcc = 4,
    Occ_26 = 2,
    CensusInd = 4,
    Ind_23 = 2
  )

  data.frame(
    widths = unname(layout),
    names = names(layout),
    stringsAsFactors = FALSE
  )
})

# Read in the file, splitting every record at the widths held in the
# 'columns' dataframe.
mort2020 <- read.fwf(
  "VS20MORT.DUSMCPUB_r20220105",
  widths = columns$widths,
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
# Attach column names to variables
colnames(mort2020) <- columns$names

# Remove blank variables (every column whose name starts with "blank")
mort2020x <- mort2020 %>%
  dplyr::select(-starts_with("blank"))

或者,这些文件在大多数年份似乎也会以 CSV 格式发布:https://www.nber.org/research/data/mortality-data-vital-statistics-nchs-multiple-cause-death-data 。2020 年的数据尚未发布,但对于其他年份,将 CSV 读入 R 可能比使用 read.fwf 快得多。


1
投票

这些数据是固定宽度(fixed-width)格式的文件。国家卫生统计中心数据的用户指南中给出了各字段对应的宽度。下面的答案改编自 @Hack-R 在另一个论坛上发布的答案。

https://opendata.stackexchange.com/questions/18375/how-can-one-interpret-the-nvss-mortality-multiple-cause-of-death-data-sets

# Fixed-width layout for the mortality file: 'widths' holds each field's width
# in characters and 'cn' the column name to assign. The trailing comments give
# the 1-based character positions implied by the cumulative widths.
map <- data.frame(widths = c(
  19, 1, 40, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 16, 4,
  1, 1, 1, 1, 34, 1, 1, 4, 3, 1, 3, 3, 2, 1, 2,
  rep(7, 20),  # cond1-cond20
  36, 2, 1,
  rep(5, 20),  # acond1-acond20
  1, 2, 1, 1, 1, 1, 33, 3, 1, 1
))
# Set column names
# NOTE(review): positions 70-73 are read here as four separate 1-character
# columns (age_years .. age_4); the other layouts on this page read the same
# span as a single 4-character DetailAge field. Confirm the intended meaning
# of these four names against the NCHS record layout.
map$cn <- c("blank",                  # cols 1-19
            "res_status",             # 20
            "blank2",                 # 21-60
            "ed_v89",                 # 61-62
            "ed_v03",                 # 63
            "ed_flag",                # 64
            "death_month",            # 65-66
            "blank3",                 # 67-68
            "sex",                    # 69
            "age_years",              # 70
            "age_months",             # 71
            "age_3",                  # 72
            "age_4",                  # 73
            "age_sub_flag",           # 74
            "age_recode_52",          # 75-76
            "age_recode_27",          # 77-78
            "age_recode_12",          # 79-80
            "infant_age_recode_22",   # 81-82
            "place_of_death",         # 83
            "marital_status",         # 84
            "death_day",              # 85
            "blank4",                 # 86-101
            "current_year",           # 102-105
            "work_injury",            # 106
            "death_manner",           # 107
            "disposition",            # 108
            "autopsy",                # 109
            "blank5",                 # 110-143
            "activity_code",          # 144
            "place_injured",          # 145
            "icd_cause_of_death",     # 146-149
            "cause_recode358",        # 150-152
            "blank6",                 # 153
            "cause_recode113",        # 154-156
            "infant_cause_recode130", # 157-159
            "cause_recode39",         # 160-161
            "blank7",                 # 162
            "num_entity_axis",        # 163-164
            "cond1", "cond2", "cond3", "cond4", "cond5", "cond6", "cond7",
            "cond8", "cond9", "cond10", "cond11", "cond12", "cond13", "cond14",
            "cond15", "cond16", "cond17", "cond18", "cond19",
            "cond20",                 # cond1-cond20: 165-304
            # This filler was also called "blank7", duplicating the name used
            # for position 162; column names must be unique, so it is renamed.
            "blank7_1",               # 305-340
            "num_rec_axis_cond",      # 341-342
            "blank8",                 # 343
            "acond1", "acond2", "acond3", "acond4", "acond5", "acond6", "acond7",
            "acond8", "acond9", "acond10", "acond11", "acond12", "acond13",
            "acond14", "acond15", "acond16", "acond17", "acond18", "acond19",
            "acond20",                # acond1-acond20: 344-443
            "blank9",                 # 444
            "race",                   # 445-446
            "bridged_race_flag",      # 447
            "race_imp_flag",          # 448
            "race_recode3",           # 449
            "race_recode5",           # 450
            "blank10",                # 451-483
            "hisp",                   # 484-486
            "blank11",                # 487
            "hisp_recode")            # 488

# Fail early if a future edit reintroduces a duplicated column name.
stopifnot(anyDuplicated(map$cn) == 0L)


# Import the file
# read_fwf() and fwf_widths() come from readr, which this snippet never
# attaches, so both calls are namespace-qualified.
mort2020 <- readr::read_fwf(
  "./data/original/VS20MORT.DUSMCPUB_r20220105",
  readr::fwf_widths(map$widths, map$cn)
)


0
投票

更新2022年数据:

# Increase the memory limit (in MB) to make space for the large file.
# memory.limit() only ever had an effect on Windows, and since R 4.2.0 it is a
# stub that merely emits a warning, so it is called only where it still works.
# Run memory.limit() with no arguments to inspect the current limit.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  memory.limit(size = 20000)
}

# Create dataframe containing the width and the name of every fixed-width
# field in the 2022 file, in file order. The number after each name below is
# that field's width in characters.
columns <- data.frame(widths = c(
  18, 1, 1, 42, 1, 1, 2, 2, 1, 4, 1, 2, 2, 2, 2, 1, 1, 1, 16, 4, 1, 1, 1,
  1, 34, 1, 1, 4, 3, 1, 3, 3, 2, 1, 2,
  rep(7, 20),  # Condition_1EA .. Condition_20EA
  36, 2, 1,
  rep(5, 20),  # Condition_1RA .. Condition_20RA
  4, 1, 1, 1, 33, 3, 2, 2, 315, 4, 2, 4, 2
))
columns$names <- c("blank1", # 18 (tape locations 1-18)
               "record_type", # 1 (tape location 19)
               "Resident_Status_US", # 1 (tape location 20)
               "blank2", # 42
               "Education_2003", # 1
               "Education_flag", # 1
               "Month_of_Death", # 2
               "blank3", # 2
               "Sex", # 1
               "DetailAge", # 4
               "Age_Substitution_Flag", # 1
               "Age_Recode_52", # 2
               "Age_Recode_27", # 2
               "Age_Recode_12", # 2
               "Infant_Age_Recode_22", # 2
               "Place_of_Death_and_Status", # 1
               "Marital_Status", # 1
               "Day_of_Week_of_Death", # 1
               "blank4", # 16
               "Current_Data_Year", # 4
               "Injury_at_Work", # 1
               "Manner_of_Death", # 1
               "Method_of_Disposition", # 1
               "Autopsy", # 1
               "blank5", # 34
               "Activity_Code", # 1
               "Place_of_Injury", # 1
               "ICD_Code_10", # 4
               "Cause_Recode_358", # 3
               "blank6", # 1
               "Cause_Recode_113", # 3
               "Infant_Cause_Recode_130", # 3
               "Cause_Recode_39", # 2
               "blank7", # 1
               "Number_Entity_Axis_Conditions", # 2
               # 20 entity-axis condition fields, 7 characters each
               "Condition_1EA", "Condition_2EA", "Condition_3EA", "Condition_4EA", "Condition_5EA",
               "Condition_6EA", "Condition_7EA", "Condition_8EA", "Condition_9EA", "Condition_10EA",
               "Condition_11EA", "Condition_12EA", "Condition_13EA", "Condition_14EA", "Condition_15EA",
               "Condition_16EA", "Condition_17EA", "Condition_18EA", "Condition_19EA", "Condition_20EA",
               "blank8", # 36
               "Number_Record_Axis_Conditions", # 2
               "blank9", # 1
               # 20 record-axis condition fields, 5 characters each
               "Condition_1RA", "Condition_2RA", "Condition_3RA", "Condition_4RA", "Condition_5RA",
               "Condition_6RA", "Condition_7RA", "Condition_8RA", "Condition_9RA", "Condition_10RA",
               "Condition_11RA", "Condition_12RA", "Condition_13RA", "Condition_14RA", "Condition_15RA",
               "Condition_16RA", "Condition_17RA", "Condition_18RA", "Condition_19RA", "Condition_20RA",
               "blank10", # 4
               "race_imputed", # 1
               "blank10_1", # 1
               "race_recod_6", # 1
               # A comma was missing after "blank11", which made the whole
               # vector a syntax error.
               "blank11", # 33
               "Hispanic_Origin", # 3
               "Hispanic_Origin_9_Race_Recode", # 2
               "Race_Recode_40", # 2
               "blank13", # 315
               "CensusOcc", # 4
               "Occ_26", # 2
               "CensusInd", # 4
               "Ind_23") # 2

# Read in file using parameters from 'columns' dataframe.
# NOTE(review): 'data_path' is not defined in this snippet; set it to the
# folder that holds the downloaded file before running.
mort2022 <- read.fwf(
  here::here(data_path, "VS22MORT.DUSMCPUB_r20240307"),
  widths = columns$widths,
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)
# Attach column names to variables
colnames(mort2022) <- columns$names

# Remove blank variables and normalise the remaining column names.
# The result is stored as 'mort2022x'; it used to be assigned to 'mort2020x',
# a name left over from the 2020 script that mislabelled the 2022 data.
# clean_names() is presumably janitor's; it is namespace-qualified because
# this snippet never attaches that package.
mort2022x <- mort2022 %>%
  dplyr::select(-starts_with("blank")) %>%
  janitor::clean_names()
最新问题
© www.soinside.com 2019 - 2025. All rights reserved.