我有一些二进制文件,其中包含地点名称和坐标(纬度、经度)。每当我使用 .ascii 编码将其解析为 String 时,都无法得到正确的结果。我猜测是 Float 值(坐标)的解析失败了。
读取InputStream
extension Data {
    /// Creates a `Data` value by reading `input` until no more bytes are available.
    ///
    /// The stream is opened before reading and is always closed before the
    /// initializer returns. Reading stops at end-of-stream or at the first read
    /// error; whatever was read up to that point is kept.
    ///
    /// - Parameter input: The stream to drain. It is opened and closed here.
    init(reading input: InputStream) {
        self.init()
        input.open()
        defer { input.close() }

        let bufferSize = 1024
        var buffer = [UInt8](repeating: 0, count: bufferSize)
        while input.hasBytesAvailable {
            let bytesRead = input.read(&buffer, maxLength: bufferSize)
            // `read` returns 0 at end-of-stream and a negative value on error;
            // appending with a negative count would trap, so stop instead.
            guard bytesRead > 0 else { break }
            append(contentsOf: buffer[..<bytesRead])
        }
    }
}
// File to parse
// Locate MN.dat in the main bundle and open it as a stream.
// `Bundle.main.path(forResource:ofType:)` returns an optional, so it has to be
// unwrapped before it can be passed to `InputStream(fileAtPath:)`.
guard let filepath = Bundle.main.path(forResource: "MN", ofType: "dat"),
      let stream = InputStream(fileAtPath: filepath) else {
    fatalError("MN.dat could not be opened from the main bundle")
}

// Read the whole file into memory.
let data = Data(reading: stream)

// Attempt to decode the entire file as ASCII text. The file is binary (names
// plus Float coordinates), so this is the step that does not give a usable result.
let parsedData = String(data: data, encoding: .ascii)
有什么办法可以正确地解析它吗?
例如,Java ObjectInputStream具有以下方法:
inputStreamObj.readUTF() inputStreamObj.readFloat()Java的
I have binary files containing place names and coordinates (latitude, longitude). Whenever I parse one into a String using the .ascii encoding, it is not parsed correctly. I assume that parsing the Float values (coordinates) is what fails.
Reading InputStream
extension Data {
    /// Creates a `Data` value by reading `input` until no more bytes are available.
    ///
    /// The stream is opened before reading and is always closed before the
    /// initializer returns. Reading stops at end-of-stream or at the first read
    /// error; whatever was read up to that point is kept.
    ///
    /// - Parameter input: The stream to drain. It is opened and closed here.
    init(reading input: InputStream) {
        self.init()
        input.open()
        defer { input.close() }

        let bufferSize = 1024
        var buffer = [UInt8](repeating: 0, count: bufferSize)
        while input.hasBytesAvailable {
            let bytesRead = input.read(&buffer, maxLength: bufferSize)
            // `read` returns 0 at end-of-stream and a negative value on error;
            // appending with a negative count would trap, so stop instead.
            guard bytesRead > 0 else { break }
            append(contentsOf: buffer[..<bytesRead])
        }
    }
}
// File to parse
// Locate MN.dat in the main bundle and open it as a stream.
// `Bundle.main.path(forResource:ofType:)` returns an optional, so it has to be
// unwrapped before it can be passed to `InputStream(fileAtPath:)`.
guard let filepath = Bundle.main.path(forResource: "MN", ofType: "dat"),
      let stream = InputStream(fileAtPath: filepath) else {
    fatalError("MN.dat could not be opened from the main bundle")
}

// Read the whole file into memory.
let data = Data(reading: stream)

// Attempt to decode the entire file as ASCII text. The file is binary (names
// plus Float coordinates), so this is the step that does not give a usable result.
let parsedData = String(data: data, encoding: .ascii)
// Any ideas how I could parse it in the correct way?
For example, Java's ObjectInputStream has methods called:
inputStreamObj.readUTF() inputStreamObj.readFloat()Java
最满意答案
正如我在评论中所写,您需要阅读《对象序列化流协议》(Object Serialization Stream Protocol)规范。
因此,前4个字节表示STREAM_MAGIC,STREAM_VERSION,预期始终为相同的值。 5字节序列0x7A 0xhh 0xhh 0xhh 0xhh表示TC_BLOCKDATALONG(0xhhhhhhhh)。
在解析字符串和浮点数之前,需要连接所有块。
所以,准备DataReader :
(几乎与 Sulthan 的实现相同,但这个版本能正确处理 Modified UTF-8。)
/// A forward-only cursor over a `Data` buffer that decodes big-endian
/// primitives and length-prefixed Modified UTF-8 strings.
///
/// `currentPosition` is an offset from the start of `data` (not a raw
/// `Data.Index`), so the reader also works when `data` is a slice whose
/// `startIndex` is not zero.
struct DataReader {
    enum DataReaderError: Error {
        /// The lead byte of an encoded character is not a valid 1-, 2- or 3-byte prefix.
        case invalidFirstByte(byte: UInt16, offset: Int)
        /// A continuation byte does not have the `10xxxxxx` form.
        case invalidFollowingByte
        /// The string's declared byte length ends in the middle of a character.
        case missingFollowingByte
        /// Fewer bytes remain than the read requires.
        case insufficientData
    }

    var data: Data
    /// Offset of the next unread byte, counted from the start of `data`.
    var currentPosition: Int

    init(data: Data) {
        self.data = data
        self.currentPosition = 0
    }

    /// Number of bytes between the cursor and the end of `data`
    /// (negative if the cursor was moved past the end).
    private var remainingCount: Int {
        return data.count - currentPosition
    }

    /// Moves the cursor by `n` bytes without reading. No bounds check is performed.
    mutating func skipBytes(_ n: Int) {
        currentPosition += n
    }

    /// Reads `MemoryLayout<T>.size` bytes and assembles them most-significant byte first.
    private mutating func readBigEndian<T: FixedWidthInteger>() throws -> T {
        let size = MemoryLayout<T>.size
        guard size <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        var value: T = 0
        for byte in data[start ..< start + size] {
            // `<<` on fixed-width integers discards shifted-out bits instead of trapping.
            value = (value << 8) | T(truncatingIfNeeded: byte)
        }
        currentPosition += size
        return value
    }

    /// Reads a 4-byte big-endian bit pattern and reinterprets it as a `Float`.
    mutating func readFloat() throws -> Float {
        let floatBits: UInt32 = try readBigEndian()
        return Float(bitPattern: floatBits)
    }

    /// Reads a 2-byte big-endian unsigned integer.
    mutating func readUnsignedShort() throws -> Int {
        let ushortValue: UInt16 = try readBigEndian()
        return Int(ushortValue)
    }

    /// Reads a 4-byte big-endian signed integer.
    mutating func readInt() throws -> Int {
        let intValue: Int32 = try readBigEndian()
        return Int(intValue)
    }

    /// Reads a single byte as an unsigned value.
    mutating func readUnsignedByte() throws -> Int {
        guard currentPosition < data.count else {
            throw DataReaderError.insufficientData
        }
        let byte = data[data.startIndex + currentPosition]
        currentPosition += 1
        return Int(byte)
    }

    /// Reads the next `n` bytes as a new, zero-based `Data`.
    ///
    /// - Throws: `DataReaderError.insufficientData` if `n` is negative or
    ///   exceeds the number of remaining bytes.
    mutating func readBytes(_ n: Int) throws -> Data {
        guard n >= 0, n <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        let subdata = data.subdata(in: start ..< start + n)
        currentPosition += n
        return subdata
    }

    /// Reads a string stored as a 2-byte big-endian byte length followed by
    /// that many bytes of Modified UTF-8.
    ///
    /// Each 1-, 2- or 3-byte sequence decodes to exactly one UTF-16 code unit.
    /// If decoding fails, the cursor is left just past the length prefix.
    ///
    /// - Throws: `insufficientData` if the declared length exceeds the remaining
    ///   bytes; `missingFollowingByte`, `invalidFollowingByte` or
    ///   `invalidFirstByte` for malformed sequences.
    mutating func readUTF() throws -> String {
        // Byte size of the encoded string
        let count = try readUnsignedShort()
        guard count <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        let bytes = [UInt8](data[start ..< start + count])

        // Decoding Modified UTF-8
        var utf16: [UInt16] = []
        utf16.reserveCapacity(count)
        var offset = 0
        while offset < count {
            let firstByte = UInt16(bytes[offset])
            if firstByte & 0b1000_0000 == 0 {
                // 0xxxxxxx: one byte
                utf16.append(firstByte)
                offset += 1
            } else if firstByte & 0b1110_0000 == 0b1100_0000 {
                // 110xxxxx 10xxxxxx: two bytes
                guard offset + 1 < count else { throw DataReaderError.missingFollowingByte }
                let second = try DataReader.continuationBits(of: bytes[offset + 1])
                utf16.append(((firstByte & 0b0001_1111) << 6) | second)
                offset += 2
            } else if firstByte & 0b1111_0000 == 0b1110_0000 {
                // 1110xxxx 10xxxxxx 10xxxxxx: three bytes
                guard offset + 2 < count else { throw DataReaderError.missingFollowingByte }
                let second = try DataReader.continuationBits(of: bytes[offset + 1])
                let third = try DataReader.continuationBits(of: bytes[offset + 2])
                utf16.append(((firstByte & 0b0000_1111) << 12) | (second << 6) | third)
                offset += 3
            } else {
                throw DataReaderError.invalidFirstByte(byte: firstByte, offset: currentPosition + offset)
            }
        }
        currentPosition += offset
        return String(decoding: utf16, as: UTF16.self)
    }

    /// Returns the low six payload bits of a `10xxxxxx` continuation byte.
    private static func continuationBits(of byte: UInt8) throws -> UInt16 {
        guard byte & 0b1100_0000 == 0b1000_0000 else {
            throw DataReaderError.invalidFollowingByte
        }
        return UInt16(byte & 0b0011_1111)
    }

    /// `true` once the cursor has reached (or been moved past) the end of `data`.
    var isAtEnd: Bool {
        return currentPosition >= data.count
    }
}
// We can parse your MN.dat as follows:
// Parse MN.dat: concatenate the payload of every block-data record, then
// decode the result as a sequence of (name, float, float) triples.
let mnUrl = Bundle.main.url(forResource: "MN", withExtension: "dat")!
do {
    let data = try Data(contentsOf: mnUrl)
    var reader = DataReader(data: data)

    // The stream starts with STREAM_MAGIC (0xACED) and STREAM_VERSION (0x0005).
    // Check them instead of skipping four bytes blindly; parsing is still
    // attempted on a mismatch so the best-effort behaviour is unchanged.
    let magic = try reader.readUnsignedShort()
    let version = try reader.readUnsignedShort()
    if magic != 0xACED || version != 0x0005 {
        print("Warning: unexpected stream header")
    }

    // First collect all blocks
    var blockData = Data()
    collectBlocks: while !reader.isAtEnd {
        let contentType = try reader.readUnsignedByte()
        let size: Int
        switch contentType {
        case 0x7A: // TC_BLOCKDATALONG: 4-byte length
            size = try reader.readInt()
        case 0x77: // TC_BLOCKDATA: 1-byte length
            size = try reader.readUnsignedByte()
        default:
            print("Unsupported content type")
            break collectBlocks
        }
        // The 4-byte length is signed; a negative value means corrupt data.
        guard size >= 0 else {
            print("Invalid block size")
            break collectBlocks
        }
        blockData.append(try reader.readBytes(size))
    }

    // Then read the contents of blockData
    var blockReader = DataReader(data: blockData)
    while !blockReader.isAtEnd {
        let string = try blockReader.readUTF()
        print(string)
        let float1 = try blockReader.readFloat()
        print(float1)
        let float2 = try blockReader.readFloat()
        print(float2)
        // Use string, float1, float2 as you like
    }
} catch {
    print(error)
}
// Output:
Albert Lea 43.648 -93.3683 Albertville 45.2377 -93.6544 Alexandria 45.8852 -95.3775 (... no errors...) Woodbury 44.9239 -92.9594 Worthington 43.62 -95.5964 Wyoming 45.3364 -92.9972 Zimmerman 45.4433 -93.59
如果二进制数据可能包含其他内容类型,则可能需要修改上面的代码。
As I wrote in the comment, you need to read the Object Serialization Stream Protocol specification.
So, the first 4 bytes represent STREAM_MAGIC and STREAM_VERSION, which are expected to always have the same values. The 5-byte sequence 0x7A 0xhh 0xhh 0xhh 0xhh represents TC_BLOCKDATALONG(0xhhhhhhhh).
All blocks need to be concatenated before parsing the strings and floats.
So, preparing the DataReader:
(Nearly the same as Sulthan's, but this treats Modified UTF-8 correctly.)
/// A forward-only cursor over a `Data` buffer that decodes big-endian
/// primitives and length-prefixed Modified UTF-8 strings.
///
/// `currentPosition` is an offset from the start of `data` (not a raw
/// `Data.Index`), so the reader also works when `data` is a slice whose
/// `startIndex` is not zero.
struct DataReader {
    enum DataReaderError: Error {
        /// The lead byte of an encoded character is not a valid 1-, 2- or 3-byte prefix.
        case invalidFirstByte(byte: UInt16, offset: Int)
        /// A continuation byte does not have the `10xxxxxx` form.
        case invalidFollowingByte
        /// The string's declared byte length ends in the middle of a character.
        case missingFollowingByte
        /// Fewer bytes remain than the read requires.
        case insufficientData
    }

    var data: Data
    /// Offset of the next unread byte, counted from the start of `data`.
    var currentPosition: Int

    init(data: Data) {
        self.data = data
        self.currentPosition = 0
    }

    /// Number of bytes between the cursor and the end of `data`
    /// (negative if the cursor was moved past the end).
    private var remainingCount: Int {
        return data.count - currentPosition
    }

    /// Moves the cursor by `n` bytes without reading. No bounds check is performed.
    mutating func skipBytes(_ n: Int) {
        currentPosition += n
    }

    /// Reads `MemoryLayout<T>.size` bytes and assembles them most-significant byte first.
    private mutating func readBigEndian<T: FixedWidthInteger>() throws -> T {
        let size = MemoryLayout<T>.size
        guard size <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        var value: T = 0
        for byte in data[start ..< start + size] {
            // `<<` on fixed-width integers discards shifted-out bits instead of trapping.
            value = (value << 8) | T(truncatingIfNeeded: byte)
        }
        currentPosition += size
        return value
    }

    /// Reads a 4-byte big-endian bit pattern and reinterprets it as a `Float`.
    mutating func readFloat() throws -> Float {
        let floatBits: UInt32 = try readBigEndian()
        return Float(bitPattern: floatBits)
    }

    /// Reads a 2-byte big-endian unsigned integer.
    mutating func readUnsignedShort() throws -> Int {
        let ushortValue: UInt16 = try readBigEndian()
        return Int(ushortValue)
    }

    /// Reads a 4-byte big-endian signed integer.
    mutating func readInt() throws -> Int {
        let intValue: Int32 = try readBigEndian()
        return Int(intValue)
    }

    /// Reads a single byte as an unsigned value.
    mutating func readUnsignedByte() throws -> Int {
        guard currentPosition < data.count else {
            throw DataReaderError.insufficientData
        }
        let byte = data[data.startIndex + currentPosition]
        currentPosition += 1
        return Int(byte)
    }

    /// Reads the next `n` bytes as a new, zero-based `Data`.
    ///
    /// - Throws: `DataReaderError.insufficientData` if `n` is negative or
    ///   exceeds the number of remaining bytes.
    mutating func readBytes(_ n: Int) throws -> Data {
        guard n >= 0, n <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        let subdata = data.subdata(in: start ..< start + n)
        currentPosition += n
        return subdata
    }

    /// Reads a string stored as a 2-byte big-endian byte length followed by
    /// that many bytes of Modified UTF-8.
    ///
    /// Each 1-, 2- or 3-byte sequence decodes to exactly one UTF-16 code unit.
    /// If decoding fails, the cursor is left just past the length prefix.
    ///
    /// - Throws: `insufficientData` if the declared length exceeds the remaining
    ///   bytes; `missingFollowingByte`, `invalidFollowingByte` or
    ///   `invalidFirstByte` for malformed sequences.
    mutating func readUTF() throws -> String {
        // Byte size of the encoded string
        let count = try readUnsignedShort()
        guard count <= remainingCount else {
            throw DataReaderError.insufficientData
        }
        let start = data.startIndex + currentPosition
        let bytes = [UInt8](data[start ..< start + count])

        // Decoding Modified UTF-8
        var utf16: [UInt16] = []
        utf16.reserveCapacity(count)
        var offset = 0
        while offset < count {
            let firstByte = UInt16(bytes[offset])
            if firstByte & 0b1000_0000 == 0 {
                // 0xxxxxxx: one byte
                utf16.append(firstByte)
                offset += 1
            } else if firstByte & 0b1110_0000 == 0b1100_0000 {
                // 110xxxxx 10xxxxxx: two bytes
                guard offset + 1 < count else { throw DataReaderError.missingFollowingByte }
                let second = try DataReader.continuationBits(of: bytes[offset + 1])
                utf16.append(((firstByte & 0b0001_1111) << 6) | second)
                offset += 2
            } else if firstByte & 0b1111_0000 == 0b1110_0000 {
                // 1110xxxx 10xxxxxx 10xxxxxx: three bytes
                guard offset + 2 < count else { throw DataReaderError.missingFollowingByte }
                let second = try DataReader.continuationBits(of: bytes[offset + 1])
                let third = try DataReader.continuationBits(of: bytes[offset + 2])
                utf16.append(((firstByte & 0b0000_1111) << 12) | (second << 6) | third)
                offset += 3
            } else {
                throw DataReaderError.invalidFirstByte(byte: firstByte, offset: currentPosition + offset)
            }
        }
        currentPosition += offset
        return String(decoding: utf16, as: UTF16.self)
    }

    /// Returns the low six payload bits of a `10xxxxxx` continuation byte.
    private static func continuationBits(of byte: UInt8) throws -> UInt16 {
        guard byte & 0b1100_0000 == 0b1000_0000 else {
            throw DataReaderError.invalidFollowingByte
        }
        return UInt16(byte & 0b0011_1111)
    }

    /// `true` once the cursor has reached (or been moved past) the end of `data`.
    var isAtEnd: Bool {
        return currentPosition >= data.count
    }
}
// We can parse your MN.dat as follows:
// Parse MN.dat: concatenate the payload of every block-data record, then
// decode the result as a sequence of (name, float, float) triples.
let mnUrl = Bundle.main.url(forResource: "MN", withExtension: "dat")!
do {
    let data = try Data(contentsOf: mnUrl)
    var reader = DataReader(data: data)

    // The stream starts with STREAM_MAGIC (0xACED) and STREAM_VERSION (0x0005).
    // Check them instead of skipping four bytes blindly; parsing is still
    // attempted on a mismatch so the best-effort behaviour is unchanged.
    let magic = try reader.readUnsignedShort()
    let version = try reader.readUnsignedShort()
    if magic != 0xACED || version != 0x0005 {
        print("Warning: unexpected stream header")
    }

    // First collect all blocks
    var blockData = Data()
    collectBlocks: while !reader.isAtEnd {
        let contentType = try reader.readUnsignedByte()
        let size: Int
        switch contentType {
        case 0x7A: // TC_BLOCKDATALONG: 4-byte length
            size = try reader.readInt()
        case 0x77: // TC_BLOCKDATA: 1-byte length
            size = try reader.readUnsignedByte()
        default:
            print("Unsupported content type")
            break collectBlocks
        }
        // The 4-byte length is signed; a negative value means corrupt data.
        guard size >= 0 else {
            print("Invalid block size")
            break collectBlocks
        }
        blockData.append(try reader.readBytes(size))
    }

    // Then read the contents of blockData
    var blockReader = DataReader(data: blockData)
    while !blockReader.isAtEnd {
        let string = try blockReader.readUTF()
        print(string)
        let float1 = try blockReader.readFloat()
        print(float1)
        let float2 = try blockReader.readFloat()
        print(float2)
        // Use string, float1, float2 as you like
    }
} catch {
    print(error)
}
// Output:
Albert Lea 43.648 -93.3683 Albertville 45.2377 -93.6544 Alexandria 45.8852 -95.3775 (... no errors...) Woodbury 44.9239 -92.9594 Worthington 43.62 -95.5964 Wyoming 45.3364 -92.9972 Zimmerman 45.4433 -93.59
You may need to modify the code above if your binary data may contain other content types.
更多推荐
发布评论