1
0
mirror of https://github.com/metabolist/metatext synced 2025-01-01 12:37:18 +01:00

Implement serializable Bloom filter

This commit is contained in:
Justin Mazzocchi 2020-09-05 04:19:04 -07:00
parent 4e029b40ac
commit 781be478ba
No known key found for this signature in database
GPG Key ID: E223E6937AAFB01C
6 changed files with 186 additions and 0 deletions

View File

@ -86,6 +86,7 @@
D047FA8C24C3E21200AF17C5 /* Metatext.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Metatext.app; sourceTree = BUILT_PRODUCTS_DIR; };
D0666A2124C677B400F3F04B /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
D0666A2524C677B400F3F04B /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
D07E164425037264008B10D0 /* SerializableBloomFilter */ = {isa = PBXFileReference; lastKnownFileType = folder; path = SerializableBloomFilter; sourceTree = "<group>"; };
D085C3BB25008DEC008A6C5E /* DB */ = {isa = PBXFileReference; lastKnownFileType = folder; path = DB; sourceTree = "<group>"; };
D0BDF66524FD7A6400C7FA1C /* ServiceLayer */ = {isa = PBXFileReference; lastKnownFileType = folder; path = ServiceLayer; sourceTree = "<group>"; };
D0BEB1F224F8EE8C001B0F04 /* AttachmentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AttachmentView.swift; sourceTree = "<group>"; };
@ -181,6 +182,7 @@
D047FA7F24C3E21000AF17C5 = {
isa = PBXGroup;
children = (
D07E164425037264008B10D0 /* SerializableBloomFilter */,
D0C7D45224F76169001EBDBB /* Assets.xcassets */,
D085C3BB25008DEC008A6C5E /* DB */,
D0C7D46824F76169001EBDBB /* Extensions */,

5
SerializableBloomFilter/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.DS_Store
/.build
/Packages
/*.xcodeproj
xcuserdata/

View File

@ -0,0 +1,25 @@
// swift-tools-version:5.3
import PackageDescription
// Manifest for the Bloom filter package: a single library product backed by a single
// target, plus the unit-test target that exercises it.
let package = Package(
    name: "SerializableBloomFilter",
    // Minimum deployment targets.
    platforms: [.iOS(.v14), .macOS(.v11)],
    products: [
        .library(name: "SerializableBloomFilter", targets: ["SerializableBloomFilter"])
    ],
    // No external packages are required.
    dependencies: [],
    targets: [
        .target(name: "SerializableBloomFilter", dependencies: []),
        .testTarget(name: "SerializableBloomFilterTests", dependencies: ["SerializableBloomFilter"])
    ]
)

View File

@ -0,0 +1,67 @@
// Copyright © 2020 Metabolist. All rights reserved.
// Adapted from https://github.com/dduan/BitArray
import Foundation
/// A fixed-length sequence of bits packed into a byte array.
struct Bits {
    /// Number of bits this value was created to hold.
    let count: Int
    /// Packed backing storage, `bitsInByte` bits per element.
    private var bytes: [UInt8]

    /// Creates `count` bits, all initially cleared.
    ///
    /// - Parameter count: Number of bits to allocate storage for.
    init(count: Int) {
        let split = count.quotientAndRemainder(dividingBy: Self.bitsInByte)
        // A partial trailing group of bits still occupies a whole byte.
        let byteCount = split.remainder > 0 ? split.quotient + 1 : split.quotient

        self.count = count
        self.bytes = [UInt8](repeating: 0, count: byteCount)
    }

    /// Wraps already-packed storage.
    ///
    /// - Parameters:
    ///   - bytes: Packed bits, stored as given without validation.
    ///   - count: Number of bits the storage is meant to represent.
    init(bytes: [UInt8], count: Int) {
        self.count = count
        self.bytes = bytes
    }
}
extension Bits {
    /// The packed backing bytes as `Data`.
    var data: Data {
        Data(bytes)
    }

    /// Reads or writes the bit at `index`.
    ///
    /// Bit `index` is stored in byte `index / 8`, at bit position `index % 8` of that byte.
    subscript(index: Int) -> Bool {
        get {
            let (byteOffset, bitOffset) = index.quotientAndRemainder(dividingBy: Self.bitsInByte)

            return bytes[byteOffset] & mask(index: bitOffset) > 0
        }
        set {
            let (byteOffset, bitOffset) = index.quotientAndRemainder(dividingBy: Self.bitsInByte)

            guard newValue else {
                // Clearing: keep every bit of the byte except the addressed one.
                bytes[byteOffset] &= ~mask(index: bitOffset)
                return
            }

            bytes[byteOffset] |= mask(index: bitOffset)
        }
    }
}
private extension Bits {
    /// Number of bits held by each element of the backing byte array.
    static let bitsInByte = 8

    /// Returns a byte with only the bit at position `index` set, where position 0 is the
    /// least significant bit.
    ///
    /// Traps when `index` is outside `0...7`.
    func mask(index: Int) -> UInt8 {
        guard (0..<Self.bitsInByte).contains(index) else {
            fatalError("Invalid index: \(index)")
        }

        return UInt8(1) << index
    }
}

View File

@ -0,0 +1,56 @@
// Copyright © 2020 Metabolist. All rights reserved.
import Foundation
// https://en.wikipedia.org/wiki/Bloom_filter
// https://khanlou.com/2018/09/bloom-filters/
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
/// A Bloom filter over strings whose bit array can be written out as `Data` and restored later.
struct SerializableBloomFilter {
    /// Errors thrown by `init(serialization:)`.
    enum DeserializationError: Error, Equatable {
        /// The data does not have the byte count of this filter's bit array.
        case invalidByteCount(expected: Int, actual: Int)
    }

    /// The filter's bit array.
    private var items: Bits

    /// Creates an empty filter.
    init() {
        items = Bits(count: Self.itemCount)
    }

    /// Restores a filter from bytes previously obtained from `serialization`.
    ///
    /// - Parameter serialization: The packed bit array of a filter.
    /// - Throws: `DeserializationError.invalidByteCount` when the data is not exactly as long
    ///   as this filter's bit array. Accepting such data would let later lookups index past
    ///   the end of the storage and crash.
    init(serialization: Data) throws {
        // Derive the expected length from an empty bit array so this check cannot drift
        // from the layout `Bits` actually uses.
        let expectedByteCount = Bits(count: Self.itemCount).data.count

        guard serialization.count == expectedByteCount else {
            throw DeserializationError.invalidByteCount(
                expected: expectedByteCount,
                actual: serialization.count)
        }

        items = Bits(bytes: Array(serialization), count: Self.itemCount)
    }
}
extension SerializableBloomFilter {
    /// The filter's bit array as raw bytes.
    var serialization: Data {
        items.data
    }

    /// Records `newMember` by setting every bit selected by its hashes.
    mutating func insert(_ newMember: String) {
        Self.indices(newMember).forEach { items[$0] = true }
    }

    /// Returns `true` only when every bit selected by the hashes of `member` is set.
    ///
    /// A `false` result means `member` was never inserted; a `true` result can also be
    /// produced by other inserted strings that happen to set the same bits.
    func contains(_ member: String) -> Bool {
        let selectedBits = Self.indices(member).map { items[$0] }

        return !selectedBits.contains(false)
    }
}
private extension SerializableBloomFilter {
    /// Number of bits in the filter.
    static let itemCount = 1024

    /// Hash functions applied to every string; each one selects one bit per string.
    static let hashFunctions = [djb2, sdbm]

    /// Returns the bit positions selected for `string`, one per hash function, each in
    /// `0..<itemCount`.
    static func indices(_ string: String) -> [Int] {
        let bitCount = UInt(itemCount)

        // The hashes wrap on overflow and may be any `Int`, including `Int.min`, for which
        // `abs(_:)` traps. `magnitude` yields the same value as `abs` for every other input
        // and is representable for `Int.min` too, so the produced indices are unchanged.
        return hashFunctions.map { hash in
            Int(hash(string).magnitude % bitCount)
        }
    }
}
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
/// djb2 string hash over the Unicode scalars of `string`.
///
/// Starts from 5381 and, for each scalar, computes `hash * 33 + scalar` with wrapping
/// addition, so the result may be negative for longer strings.
private func djb2(_ string: String) -> Int {
    var hash = 5381

    for scalar in string.unicodeScalars {
        hash = (hash << 5) &+ hash &+ Int(scalar.value)
    }

    return hash
}
/// sdbm string hash over the Unicode scalars of `string`.
///
/// Starts from 0 and, for each scalar, computes `scalar + hash * 65599`, expressed as
/// `scalar + (hash << 6) + (hash << 16) - hash`. Every step wraps on overflow, so the
/// result may be negative for longer strings.
private func sdbm(_ string: String) -> Int {
    string.unicodeScalars.reduce(0) { hash, scalar in
        // The subtraction must wrap like the additions do: once the accumulator has
        // overflowed, a trapping `-` can crash on ordinary input.
        Int(scalar.value) &+ (hash << 6) &+ (hash << 16) &- hash
    }
}

View File

@ -0,0 +1,31 @@
@testable import SerializableBloomFilter
import XCTest
/// Unit tests for `SerializableBloomFilter` membership and serialization round-tripping.
final class SerializableBloomFilterTests: XCTestCase {
    /// Inserted strings are reported as contained; strings that were never inserted are not.
    func testContains() {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        XCTAssert(filter.contains("lol"))
        XCTAssert(filter.contains("ok"))
        XCTAssertFalse(filter.contains("wtf"))
        XCTAssertFalse(filter.contains("no"))
    }

    /// A filter restored from its serialization answers membership queries like the original.
    func testSerialization() throws {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        let serialization = filter.serialization
        let deserializedFilter = try SerializableBloomFilter(serialization: serialization)

        // Every assertion targets the restored filter; checking the original here would
        // leave the round trip unverified.
        XCTAssert(deserializedFilter.contains("lol"))
        XCTAssert(deserializedFilter.contains("ok"))
        XCTAssertFalse(deserializedFilter.contains("wtf"))
        XCTAssertFalse(deserializedFilter.contains("no"))
        XCTAssertEqual(deserializedFilter.serialization, serialization)
    }
}