When Spark reads from HDFS, you can subclass FileInputFormat<LongWritable, Text> to implement a custom TextInputFormat that filters the files from which input splits are computed, so that only the HDFS files you specify are actually read.
val value: RDD[(LongWritable, Text)] = sc.newAPIHadoopFile[LongWritable, Text, TextInputFormat](url)
Overriding TextInputFormat:
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyFileInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Honor a custom record delimiter if one is set in the job configuration;
        // LineRecordReader falls back to newline when the delimiter bytes are null.
        String delimiter = context.getConfiguration().get("textinputformat.record.delimiter");
        byte[] recordDelimiterBytes = null;
        if (null != delimiter) {
            recordDelimiterBytes = delimiter.getBytes(StandardCharsets.UTF_8);
        }
        return new LineRecordReader(recordDelimiterBytes);
    }
}
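As written, the class only customizes the record reader; the file filtering described at the top happens by overriding FileInputFormat's listStatus, which returns the list of files that splits are then computed from. Below is a minimal sketch of that override, assuming we want to keep only files whose names end in ".log"; the suffix and the predicate are illustrative assumptions, not part of the original code.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.mapreduce.JobContext;

// Add inside MyFileInputFormat: drop unwanted files before splits are computed.
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> kept = new ArrayList<>();
    for (FileStatus status : super.listStatus(job)) {
        // Illustrative predicate (an assumption): keep only ".log" files.
        if (status.getPath().getName().endsWith(".log")) {
            kept.add(status);
        }
    }
    return kept;
}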
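With the custom format on the classpath, the driver call from the top changes only its InputFormat type parameter: sc.newAPIHadoopFile[LongWritable, Text, MyFileInputFormat](url). For reference, here is a sketch of an equivalent Java driver, which also shows where the textinputformat.record.delimiter key read by createRecordReader would be set; the path, app name, and delimiter value are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class MyFileInputFormatDemo {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("MyFileInputFormatDemo"));
        Configuration conf = new Configuration();
        // Optional: the delimiter read by createRecordReader above; "\n" is the default.
        conf.set("textinputformat.record.delimiter", "\n");
        // Placeholder path; only files passing the listStatus filter are read.
        JavaPairRDD<LongWritable, Text> rdd = jsc.newAPIHadoopFile(
                "hdfs:///path/to/dir", MyFileInputFormat.class,
                LongWritable.class, Text.class, conf);
        System.out.println(rdd.count());
        jsc.stop();
    }
}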