我有几个需要解析的 CSV 文件,之后要将数据插入到 MySQL 中。我已经编写了一个解析器,但我想尽可能避免代码重复。
我已经想到我应该使用抽象类,或者也许是工厂,但我无法真正确定设计这个的最佳方法。
这是我的解析器:
public class LocusAliasCsvParser {
private static final String[] FILE_HEADER_MAPPING = {"id", "locusID", "organismid", "variable", "alias"};
private static final String ID = "id";
private static final String LOCUS_ID = "locusID";
private static final String ORGANISM_ID = "organismid";
private static final String VARIABLE = "variable";
private static final String ALIAS = "alias";
/**
 * Parses the CSV file at {@code fileName} into {@link AliasLocus} beans.
 *
 * @param fileName path of the CSV file to read
 * @return the mapped beans; possibly partial (or empty) if an error occurred,
 *         matching the original never-throws contract
 */
public static List<AliasLocus> readCsvFile(String fileName) {
    CSVFormat csvFileFormat = CSVFormat.DEFAULT.withHeader(FILE_HEADER_MAPPING);
    List<AliasLocus> aliases = new ArrayList<>();
    // try-with-resources closes both resources in every case and avoids the
    // NullPointerException the old finally block threw when new FileReader(...)
    // itself failed (fileReader was still null there).
    try (FileReader fileReader = new FileReader(fileName);
            CSVParser csvFileParser = new CSVParser(fileReader, csvFileFormat)) {
        List<CSVRecord> csvRecords = csvFileParser.getRecords();
        // Record 0 is the header line itself (withHeader() does not skip it),
        // so data starts at index 1.
        for (int i = 1; i < csvRecords.size(); i++) {
            CSVRecord record = csvRecords.get(i);
            aliases.add(new AliasLocus(Integer.parseInt(record.get(ID)),
                    record.get(LOCUS_ID),
                    record.get(ORGANISM_ID),
                    record.get(VARIABLE),
                    record.get(ALIAS)));
        }
    } catch (Exception e) {
        System.out.println("Error in CsvFileReader !!!");
        e.printStackTrace();
    }
    return aliases;
}
每次改变的事情是:
public class LocusAliasCsvParser {
private static final String[] FILE_HEADER_MAPPING = {"id", "locusID", "organismid", "variable", "alias"};
private static final String ID = "id";
private static final String LOCUS_ID = "locusID";
private static final String ORGANISM_ID = "organismid";
private static final String VARIABLE = "variable";
private static final String ALIAS = "alias";
和:
public static List<AliasLocus> readCsvFile(String fileName) {
AliasLocus aliasLocus = new AliasLocus(Integer.parseInt(record.get(ID)),
record.get(LOCUS_ID),
record.get(ORGANISM_ID),
record.get(VARIABLE),
record.get(ALIAS));
有人可以建议最好的设计模式或结构来减少代码重复吗?
谢谢
您应该使用接口分离不同的关注点并实现读取 csv 文件的模板方法。
让我们通过 6 个步骤建立一个简单的框架。
您需要一个知道如何获取 csv 结构的类。
/**
 * Supplies the {@link CsvMetadata} (column structure) for a CSV source.
 */
public interface CsvMetadataSource {

    /** @return the column metadata describing the CSV structure */
    CsvMetadata getCsvMetadata();
}
您将需要一个可以解析 csv 行的类。第 1 步中的
CsvMetadata
是放置该逻辑的好地方。
/**
 * Describes the column structure of a CSV source and parses single data lines
 * into column-name -&gt; value maps.
 *
 * <p>Note: this simple implementation splits on every comma, so it does not
 * yet handle quoted fields that contain commas (see RFC 4180).
 */
public class CsvMetadata {

    private final List<String> columns;

    public CsvMetadata(List<String> columns) {
        this.columns = columns;
    }

    /**
     * Parses one CSV line into a map keyed by column name. Columns beyond the
     * values present in the line are mapped to {@code null}.
     */
    public Map<String, String> parseLine(String line) {
        // limit -1 keeps trailing empty fields ("a,b," -> "a","b",""),
        // which the default split(",") silently drops
        String[] values = line.split(",", -1);
        Map<String, String> record = new HashMap<>();
        for (int i = 0; i < columns.size(); i++) {
            record.put(columns.get(i), i < values.length ? values[i] : null);
        }
        return record;
    }
}
您需要一个可以将解析的行映射到对象的类。知道行号也可能有用。
/**
 * Maps one parsed CSV record (column name -&gt; raw value) to an object of type T.
 *
 * @param <T> the target object type
 */
public interface CsvRecordMapper<T> {

    /**
     * @param csvRecord  the parsed values of one line, keyed by column name
     * @param lineNumber the zero-based number of the data record
     * @return the mapped object
     */
    T map(Map<String, String> csvRecord, int lineNumber);
}
您需要一个知道如何处理映射对象的类。
/**
 * Receives each mapped object while the CSV data is being read.
 *
 * @param <T> the mapped object type
 */
public interface CsvObjectCallback<T> {

    /** Processes one mapped object. */
    void process(T object);
}
您需要一个类来实现读取 csv 数据的模板方法,并且可以通过实现上述接口来扩展该类。如果有一个方便的方法来将所有对象读取为列表也可能会很好。
/**
 * Template for reading CSV data: obtains the column metadata, then parses each
 * line and hands the mapped objects to a callback. Behavior is extended via
 * the CsvMetadataSource, CsvRecordMapper and CsvObjectCallback interfaces.
 */
public class CsvReader {

    // Explicit metadata source; when null, the first input line is treated as
    // the header row.
    private CsvMetadataSource csvMetadataSource;

    public CsvReader() {
        this(null);
    }

    public CsvReader(CsvMetadataSource csvMetadataSource) {
        this.csvMetadataSource = csvMetadataSource;
    }

    /** Convenience method: reads the whole input and collects all mapped objects. */
    public <T> List<T> readAll(Reader csvInputReader, CsvRecordMapper<T> csvLineMapper) throws IOException {
        CollectCsvObjectCallback<T> collector = new CollectCsvObjectCallback<>();
        read(csvInputReader, csvLineMapper, collector);
        return collector.getObjects();
    }

    /** Template method: streams every record through the mapper to the callback. */
    public <T> void read(Reader csvInputReader, CsvRecordMapper<T> csvLineMapper,
            CsvObjectCallback<T> csvObjectCallback) throws IOException {
        try (BufferedReader lineReader = new BufferedReader(csvInputReader)) {
            read(csvLineMapper, csvObjectCallback, lineReader, getCsvMetadataSource(lineReader));
        }
    }

    // Falls back to the first input line as header row when no explicit
    // metadata source was configured.
    private CsvMetadataSource getCsvMetadataSource(BufferedReader lineReader) throws IOException {
        if (csvMetadataSource != null) {
            return csvMetadataSource;
        }
        return new RowBasedCsvMetadataSource(lineReader.readLine());
    }

    private <T> void read(CsvRecordMapper<T> csvLineMapper, CsvObjectCallback<T> csvObjectCallback,
            BufferedReader lineReader, CsvMetadataSource effectiveCsvMetadataSource) throws IOException {
        CsvMetadata metadata = effectiveCsvMetadataSource.getCsvMetadata();
        if (metadata == null) {
            return; // no structure available, nothing to parse against
        }
        int recordNumber = 0;
        String line;
        while ((line = lineReader.readLine()) != null) {
            T mapped = csvLineMapper.map(metadata.parseLine(line), recordNumber++);
            csvObjectCallback.process(mapped);
        }
    }
}
/**
 * Derives CSV metadata from a single header row by splitting it on commas.
 */
class RowBasedCsvMetadataSource implements CsvMetadataSource {

    private final String row;

    public RowBasedCsvMetadataSource(String row) {
        this.row = row;
    }

    @Override
    public CsvMetadata getCsvMetadata() {
        return new CsvMetadata(Arrays.asList(row.split(",")));
    }
}
/**
 * Callback that simply accumulates every processed object in a list.
 */
class CollectCsvObjectCallback<T> implements CsvObjectCallback<T> {

    private final List<T> objects = new ArrayList<>();

    @Override
    public void process(T object) {
        objects.add(object);
    }

    /** @return all objects collected so far, in processing order */
    public List<T> getObjects() {
        return objects;
    }
}
最后,您只需实现一个
CsvRecordMapper
即可轻松适应新的 csv 文件格式。例如
public class UserCsvRecordMapper implements CsvRecordMapper<User> {
public User map(Map<String, String> csvRecord, int lineNumber) {
String firstname = csvRecord.get("FIRST NAME");
String lastname = csvRecord.get("LAST NAME");
String username = csvRecord.get("USERNAME");
String email = csvRecord.get("EMAIL ADDRESS");
return new User(firstname, lastname, username, email);
}
}
/**
 * Immutable value object representing one user row from the CSV file.
 */
public class User {

    private final String firstname;
    private final String lastname;
    private final String username;
    private final String email;

    public User(String firstname, String lastname, String username, String email) {
        this.firstname = firstname;
        this.lastname = lastname;
        this.username = username;
        this.email = email;
    }

    public String getFirstname() {
        return firstname;
    }

    public String getLastname() {
        return lastname;
    }

    public String getUsername() {
        return username;
    }

    public String getEmail() {
        return email;
    }

    @Override
    public String toString() {
        // Keep the exact original format; Main prints this representation.
        return "User [firstname=" + firstname + ", lastname=" + lastname + ", username=" + username + ", email=" + email
                + "]";
    }
}
从客户的角度来看,它很容易使用。
CSV
FIRST NAME,LAST NAME,USERNAME,PASSWORD,EMAIL ADDRESS,PHONE NUMBER,PASSPORT,GROUPS,USERCODE,TITLE,ADDRESS 1 ,ADDRESS 2,CITY,STATE,ZIP
Frank,Riley,friley,changeme,[email protected],123-456-7890,3,"1,3",1040,Teacher,328 Innovation,Suite # 200 ,state college,PA,16803
Steve,Brannigan,sbrannigan,changeme,[email protected],123-456-7890,3,1,1041,Teacher,328 Innovation,Suite # 200 ,state college,PA,16803
Marie,Ambrose,mambrose,changeme,[email protected],123-456-7890,3,1,1042,Teacher,328 Innovation,Suite # 200 ,state college,PA,16803
还有一个简单的主类
public class Main {

    /**
     * Demo entry point: reads example.csv from the classpath next to this
     * class and prints each mapped {@link User}.
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the stream and reader, which the original
        // version leaked.
        try (InputStream csvInputStream = Main.class.getResourceAsStream("example.csv");
                InputStreamReader inputStreamReader = new InputStreamReader(csvInputStream)) {
            CsvReader csvReader = new CsvReader();
            List<User> users = csvReader.readAll(inputStreamReader, new UserCsvRecordMapper());
            for (User user : users) {
                System.out.println(user);
            }
        }
    }
}
结果是
User [firstname=Frank, lastname=Riley, username=friley, [email protected]]
User [firstname=Steve, lastname=Brannigan, username=sbrannigan, [email protected]]
User [firstname=Marie, lastname=Ambrose, username=mambrose, [email protected]]
编辑
它将如何处理在双引号之间包含换行符的 csv,即作为字段的一部分,这是 rfc 4180 所允许的?
我只是用逗号分隔该行。如果字段包含逗号,您将获得比预期列更多的分割值。在这些情况下,您可以再次合并错误拆分的列。您只需计算拆分值中双引号的出现次数。如果是偶数,则您有一个完整的字段,如果是奇数,则必须将其与下一个分割值合并并再次检查偶数计数。
我会将这些方法添加到
CsvMetadata
类中
// Splits one raw CSV line into fields, re-merging pieces that belong to a
// quoted field (see normalize).
private String[] splitFields(String line) {
    // limit -1 keeps trailing empty fields ("a,b," -> "a","b",""),
    // which the default split(",") would silently drop
    return normalize(line.split(",", -1));
}
// Merges split pieces that belong to one quoted field and strips the
// enclosing double quotes from complete fields: a field is considered
// complete once its accumulated text contains an even number of quotes.
private String[] normalize(String[] values) {
List<String> normalized = new ArrayList<>();
// Accumulates pieces until the current field is complete.
StringBuilder sb = new StringBuilder();
for (int i = 0; i < values.length; i++) {
sb.append(values[i]);
if (isEvenDoubleQuoteCount(sb)) {
// Field complete
if (sb.length() > 0) {
if (sb.charAt(0) == '"') {
// Strip the enclosing quotes (assumes a field starting with a
// quote also ends with one).
sb.deleteCharAt(0);
sb.deleteCharAt(sb.length() - 1);
}
}
normalized.add(sb.toString());
sb = new StringBuilder();
} else {
// field was split, so we need to merge it with the next value
sb.append(","); // add the missing split char
}
}
// NOTE(review): a leftover with an odd quote count (unterminated quoted
// field) is kept verbatim here, including the re-added commas.
if (sb.length() > 0) {
normalized.add(sb.toString());
}
return normalized.toArray(new String[normalized.size()]);
}
// True when the text contains a balanced (even) number of double quotes,
// i.e. no quoted section is left open.
private boolean isEvenDoubleQuoteCount(CharSequence value) {
    return countDoubleQuotes(value) % 2 == 0;
}
// Counts the occurrences of '"' in the given text.
private int countDoubleQuotes(CharSequence value) {
    int count = 0;
    for (int i = 0, n = value.length(); i < n; i++) {
        if (value.charAt(i) == '"') {
            count++;
        }
    }
    return count;
}
只需更改
parseLine
方法的第一行即可。
public Map<String, String> parseLine(String line) {
String[] values = splitFields(line);
这是它的单元测试
/**
 * Unit tests for {@code CsvMetadata.parseLine} covering plain, quoted,
 * comma-containing and line-break-containing fields.
 */
class CsvMetadataTest {

    private CsvMetadata csvMetadata;

    @BeforeEach
    void setUp() {
        csvMetadata = new CsvMetadata(Arrays.asList("a", "b", "c"));
    }

    @Test
    void unenclosedFields() {
        Map<String, String> record = csvMetadata.parseLine("aaa,bbb,ccc");
        assertEquals("aaa", record.get("a"));
        assertEquals("bbb", record.get("b"));
        assertEquals("ccc", record.get("c"));
    }

    @Test
    void enclosedFields() {
        Map<String, String> record = csvMetadata.parseLine("\"aaa\",\"bbb\",\"ccc\"");
        assertEquals("aaa", record.get("a"));
        assertEquals("bbb", record.get("b"));
        assertEquals("ccc", record.get("c"));
    }

    @Test
    void enclosedFieldsWithComma() {
        Map<String, String> record = csvMetadata.parseLine("\"a,aa\",\"bb,b\",\"ccc,\"");
        assertEquals("a,aa", record.get("a"));
        assertEquals("bb,b", record.get("b"));
        assertEquals("ccc,", record.get("c"));
    }

    @Test
    void enclosedFieldsWithLineBreaks() {
        Map<String, String> record = csvMetadata.parseLine("\"a\na\r\na\",\"bbb\n\",\"\nccc\"");
        assertEquals("a\na\r\na", record.get("a"));
        assertEquals("bbb\n", record.get("b"));
        assertEquals("\nccc", record.get("c"));
    }
}
这是我相当简单的解决方案。
声明从 CSVRecord 到每个所需实体的转换器:
/**
 * Converts a {@code CSVRecord} into an {@code AliasLocus}.
 */
class AliasLocusMapper {

    /** Column order of the locus-alias CSV files. */
    public static final String[] FILE_HEADER_MAPPING = {"id", "locusID", "organismid", "variable", "alias"};

    private static final String ID = "id";
    private static final String LOCUS_ID = "locusID";
    private static final String ORGANISM_ID = "organismid";
    private static final String VARIABLE = "variable";
    private static final String ALIAS = "alias";

    /** Builds an AliasLocus from one record; the id column must be numeric. */
    public static AliasLocus mapRecord(CSVRecord record) {
        int id = Integer.parseInt(record.get(ID));
        return new AliasLocus(id,
                record.get(LOCUS_ID),
                record.get(ORGANISM_ID),
                record.get(VARIABLE),
                record.get(ALIAS));
    }
}
/**
 * Converts a {@code CSVRecord} into a {@code Product} (example entity).
 */
class ProductMapper { // Product is an example class

    /** Column order of the product CSV files. */
    public static final String[] FILE_HEADER_MAPPING = {"id", "title", "price"};

    private static final String ID = "id";
    private static final String TITLE = "title";
    private static final String PRICE = "price";

    /** Builds a Product from one record; the id column must be numeric. */
    public static Product mapRecord(CSVRecord record) {
        int id = Integer.parseInt(record.get(ID));
        return new Product(id, record.get(TITLE), record.get(PRICE));
    }
}
然后让Parser更加通用
/**
 * Generic CSV-to-entity parser: reads a file with the given header layout and
 * converts every data record through the supplied mapper function.
 */
public class AbstractCsvParser {

    /**
     * Reads {@code fileName} and maps each data record to an entity.
     *
     * @param fileName path of the CSV file
     * @param headers  column names, in file order
     * @param mapper   converts one CSVRecord into the target entity
     * @param <T>      the target entity type
     * @return the mapped entities; possibly partial if an error occurred
     */
    public <T> List<T> readCsvFile(String fileName, String[] headers, Function<CSVRecord, T> mapper) {
        CSVFormat csvFileFormat = CSVFormat.DEFAULT.withHeader(headers);
        List<T> entities = new ArrayList<>();
        // try-with-resources closes reader and parser, which the original
        // version never closed (resource leak).
        try (FileReader fileReader = new FileReader(fileName);
                CSVParser csvFileParser = new CSVParser(fileReader, csvFileFormat)) {
            List<CSVRecord> csvRecords = csvFileParser.getRecords();
            // Record 0 is the header row itself (withHeader() does not skip
            // it), so data starts at index 1.
            for (int i = 1; i < csvRecords.size(); i++) {
                entities.add(mapper.apply(csvRecords.get(i))); // transform to desired entity
            }
        } catch (Exception e) {
            // Log instead of silently swallowing; keep the original
            // "return what we have" contract.
            e.printStackTrace();
        }
        return entities;
    }
}
然后按以下方式使用
AbstractCsvParser parser = new AbstractCsvParser();
List<AliasLocus> aliases = parser.readCsvFile(
"aliases.csv",
AliasLocusMapper.FILE_HEADER_MAPPING,
AliasLocusMapper::mapRecord);
List<Product> products = parser.readCsvFile(
"products.csv",
ProductMapper.FILE_HEADER_MAPPING,
ProductMapper::mapRecord);