这是我的之前问题的后续
我已经成功实施了一个可行的解决方案:
package com.test;
import java.io.*;
import java.util.*;
public class LineGroupProcessor {
private LineGroupProcessor() {
}
public static void main(String[] args) {
validateArgs(args);
List<String[]> validRows = readValidRows(args[0]);
UnionFind unionFind = new UnionFind(validRows.size());
for (int i = 0; i < validRows.size(); i++) {
processRow(validRows, Collections.emptyMap(), unionFind, i);
}
writeOutput(groupAndSortRows(validRows, unionFind));
}
private static void validateArgs(String[] args) {
if (args.length == 0) {
throw new IllegalArgumentException("No input file provided. Please specify a text or CSV file.");
}
String filePath = args[0];
if (!filePath.endsWith(".txt") && !filePath.endsWith(".csv")) {
throw new IllegalArgumentException("Invalid file type. Please provide a text or CSV file.");
}
File file = new File(filePath);
if (!file.exists() || !file.isFile()) {
throw new IllegalArgumentException("File does not exist or is not a valid file: " + filePath);
}
}
private static List<String[]> readValidRows(String filePath) {
List<String[]> rows = new ArrayList<>();
try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
String line;
while ((line = br.readLine()) != null) {
String[] columns = line.split(";");
if (isValidRow(columns)) {
rows.add(columns);
}
}
} catch (IOException e) {
e.printStackTrace();
}
return rows;
}
private static boolean isValidRow(String[] columns) {
for (String column : columns) {
if (column.isEmpty() && !column.matches("^\"\\d{11}\"$")) {
return false;
}
}
return true;
}
private static void processRow(List<String[]> rows, Map<String, Integer> columnValueMap, UnionFind uf, int rowIndex) {
String[] row = rows.get(rowIndex);
for (int j = 0; j < row.length; j++) {
String value = row[j].trim();
if (!value.isEmpty() && !value.equals("\"\"")) {
StringBuilder keyBuilder = new StringBuilder();
keyBuilder.append(j).append(",").append(value);
String key = keyBuilder.toString();
if (columnValueMap.containsKey(key)) {
int prevRowIdx = columnValueMap.get(key);
uf.union(rowIndex, prevRowIdx);
} else {
columnValueMap.put(key, rowIndex);
}
}
}
}
private static List<Set<String>> groupAndSortRows(List<String[]> rows, UnionFind uf) {
Map<Integer, Set<String>> groups = new HashMap<>();
for (int i = 0; i < rows.size(); i++) {
int group = uf.find(i);
groups.computeIfAbsent(group, k -> new HashSet<>()).add(Arrays.toString(rows.get(i)));
}
List<Set<String>> sortedGroups = new ArrayList<>(groups.values());
sortedGroups.sort((g1, g2) -> Integer.compare(g2.size(), g1.size()));
return sortedGroups;
}
private static void writeOutput(List<Set<String>> sortedGroups) {
long groupsWithMoreThanOneRow = sortedGroups.stream().filter(group -> group.size() > 1).count();
try (PrintWriter writer = new PrintWriter("output.txt")) {
writer.println("Total number of groups with more than one element: " + groupsWithMoreThanOneRow);
writer.println();
int groupNumber = 1;
for (Set<String> group : sortedGroups) {
writer.println("Group " + groupNumber);
for (String row : group) {
writer.println(row);
}
writer.println();
groupNumber++;
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
package com.test;
public class UnionFind {
private final int[] parent;
private final int[] rank;
public UnionFind(int size) {
parent = new int[size];
rank = new int[size];
for (int i = 0; i < size; i++) {
parent[i] = i;
rank[i] = 0;
}
}
public int find(int index) {
if (parent[index] != index) {
parent[index] = find(parent[index]);
}
return parent[index];
}
public void union(int index1, int index2) {
int element1 = find(index1);
int element2 = find(index2);
if (element1 != element2) {
if (rank[element1] > rank[element2]) {
parent[element2] = element1;
} else if (rank[element1] < rank[element2]) {
parent[element1] = element2;
} else {
parent[element2] = element1;
rank[element1]++;
}
}
}
}
该程序有特定要求:应在 30 秒内完成并最多使用 1GB 内存(-Xmx1G)。
运行 100 万行和 1000 万行的测试数据集时,出现以下错误:
> Task :com.test.LineGroupProcessor.main()
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at com.test.LineGroupProcessor.lambda$groupAndSortRows$0(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor$$Lambda/0x000002779d000400.apply(Unknown Source)
at java.base/java.util.HashMap.computeIfAbsent(HashMap.java:1228)
at com.test.LineGroupProcessor.groupAndSortRows(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor.main(LineGroupProcessor.java:19)
> Task :com.test.LineGroupProcessor.main()
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space: failed reallocation of scalar replaced objects
at java.base/java.util.HashMap.computeIfAbsent(HashMap.java:1222)
at com.test.LineGroupProcessor.groupAndSortRows(LineGroupProcessor.java:85)
at com.test.LineGroupProcessor.main(LineGroupProcessor.java:19)
如何优化代码以保持在 1GB 内存限制之内?
您目前的做法(大致)
List<String[]>
– 所以如果它有 100 万行,你将得到一个包含 100 万个元素的列表UnionFind
HashMap
,其中包含行输入的副本(文件中 100 万行中的原始行)一些观察
建议
更改步骤1
UnionFind
对象。processRow()
保留每行的带引号的值,因此 "111"
而不是简单的 111
。也许您需要保留数据中的引号(?),但如果不需要,您可以通过删除前导和尾随 "
字符来稍微减小每个字符串的大小。"123"
的数据)。更改步骤3
groups.computeIfAbsent(group, k -> new HashSet<>()).add(Arrays.toString(rows.get(i)));
,它使用行号“i”的 String[]
版本并从中构建一个新字符串 (Arrays.toString()
)。看起来意图是捕获行号“i”很重要,并且不需要存储行“i”内容的字符串副本。如果您可以捕获 Set<Integer>
而不是 Set<String>
,您将使用更少的内存。虽然这个具体建议可能适用于您的情况,也可能不适用于您的情况,但以下内容应该会有所帮助:仅存储对数据进行分组所需的内容。更改步骤4
探索压缩数据