2021-05-29:最常使用的K个单词II。在实时数据流中找到最常使用的k个单词,实现TopK类中的三个方法: TopK(k), 构造方法。add(word),增加一个新单词。topk(),得到当前最常使用的k个单词。如果两个单词有相同的使用频率,按字典序排名。
福大大 答案2021-05-30:
方法一:
redis的sorted set。hash+跳表实现计数和查找。无代码。
方法二:
节点结构体:有字符串和词频。
词频表:key是字符串,value是节点。
堆:节点数组。刚开始,我以为是大根堆。采用小根堆,如果比堆顶还小,是进不了小根堆的。
反向表:key是节点,value是在堆中的索引。
有代码。
代码用golang编写。代码如下:
package main import ( "fmt" "sort" ) func main() { a := NewTopK(2) a.add("fdd") a.add("moon") a.add("moonfdd") a.add("moonfdd") ret := a.topk() for i := 0; i < len(ret); i++ { fmt.Println(ret[i]) } } type TopK struct { //堆 heap []*Node heapSize int //字,次数 wordNodeMap map[string]*Node //反向表 nodeIndexMap map[*Node]int } func NewTopK(k int) *TopK { ret := &TopK{} ret.heap = make([]*Node, k) ret.wordNodeMap = make(map[string]*Node) ret.nodeIndexMap = make(map[*Node]int) return ret } func (this *TopK) add(word string) { if len(this.heap) == 0 { return } var curNode *Node preIndex := -1 curNode = this.wordNodeMap[word] //词频表 反向表 if curNode == nil { curNode = &Node{word, 1} this.wordNodeMap[word] = curNode this.nodeIndexMap[curNode] = -1 } else { curNode.Times++ preIndex = this.nodeIndexMap[curNode] } //小根堆 if preIndex == -1 { if this.heapSize == len(this.heap) { if this.compare(curNode, this.heap[0]) { //不用管了 return } curNode, this.heap[0] = this.heap[0], curNode this.nodeIndexMap[curNode] = -1 this.nodeIndexMap[this.heap[0]] = 0 this.HeapDown(0) } else { this.Push(curNode) } } else { this.HeapDown(preIndex) } } func (this *TopK) topk() []string { heapCopy := make([]*Node, this.heapSize) copy(heapCopy, this.heap) sort.Slice(heapCopy, func(i, j int) bool { return !this.compare(heapCopy[i], heapCopy[j]) }) ans := make([]string, this.heapSize) for i := 0; i < this.heapSize; i++ { ans[i] = heapCopy[i].Str } return ans } type Node struct { Str string Times int } //索引上移,小根堆 func (this *TopK) HeapUp(index int) { for (index-1)/2 != index && !this.compare(this.heap[(index-1)/2], this.heap[index]) { //父节点小于当前节点,当前节点必须上移 this.heap[index], this.heap[(index-1)/2] = this.heap[(index-1)/2], this.heap[index] //加强堆 this.nodeIndexMap[this.heap[index]], this.nodeIndexMap[this.heap[(index-1)/2]] = (index-1)/2, index index = (index - 1) / 2 } } //索引下沉,小根堆 func (this *TopK) HeapDown(index int) { left := 2*index + 1 for left <= this.heapSize-1 { //左孩子存在 //获取小孩子 largest := left if left+1 <= this.heapSize-1 && this.compare(this.heap[left+1], this.heap[left]) { largest++ } //比较 if !this.compare(this.heap[index], this.heap[largest]) { //当前大于最小孩子,必须下沉 this.heap[index], this.heap[largest] = this.heap[largest], this.heap[index] //加强堆 this.nodeIndexMap[this.heap[index]], this.nodeIndexMap[this.heap[largest]] = largest, index } else { break } //下一次遍历 index = largest left = 2*index + 1 } } func (this *TopK) Push(node *Node) { this.heap[this.heapSize] = node //加强堆 this.nodeIndexMap[node] = this.heapSize //索引上移 this.HeapUp(this.heapSize) this.heapSize++ } func (this *TopK) Pop() *Node { ans := this.heap[0] this.heap[0], this.heap[this.heapSize-1] = this.heap[this.heapSize-1], this.heap[0] //加强堆 this.nodeIndexMap[this.heap[0]] = 0 this.nodeIndexMap[this.heap[this.heapSize-1]] = -1 this.heapSize-- //索引下沉 this.HeapDown(0) return ans } func (this *TopK) compare(node1 *Node, node2 *Node) bool { if node1.Times == node2.Times { return node1.Str > node2.Str } return node1.Times < node2.Times }
执行结果如下:
福大大 答案2021-05-29:
方法一:
redis的sorted set。hash+跳表实现计数和查找。无代码。
方法二:
节点结构体:有字符串和词频。
词频表:key是字符串,value是节点。
堆:节点数组。
反向表:key是节点,value是在堆中的索引。
有代码,但不完整,因为时间紧。
代码用golang编写。代码如下:
package main import "fmt" func main() { a := NewTopK(2) a.add("lint") a.add("code") a.add("code") fmt.Println(a.topk()) } type TopK struct { //堆 heap []*Node heapSize int //字,次数 wordNodeMap map[string]*Node //反向表 nodeIndexMap map[*Node]int } func NewTopK(k int) *TopK { ret := &TopK{} ret.heap = make([]*Node, k) return ret } func (this *TopK) add(word string) { if len(this.heap) == 0 { return } var curNode *Node preIndex := -1 curNode = this.wordNodeMap[word] if curNode == nil { curNode = &Node{word, 1} this.wordNodeMap[word] = curNode this.nodeIndexMap[curNode] = -1 } else { //tree set curNode.Times++ preIndex = this.nodeIndexMap[curNode] } if preIndex == -1 { if this.heapSize == len(this.heap) { //treeset } else { //tree add this.nodeIndexMap[curNode] = this.heapSize this.heap[this.heapSize] = curNode this.HeapUp(preIndex) } } else { //tree add this.HeapDown(preIndex) } } func (this *TopK) topk() []string { ans := make([]string, len(this.heap)) return ans } type Node struct { Str string Times int } //索引上移,大根堆 func (this *TopK) HeapUp(index int) { for this.heap[(index-1)/2].Times < this.heap[index].Times { //父节点小于当前节点,当前节点必须上移 this.heap[index], this.heap[(index-1)/2] = this.heap[(index-1)/2], this.heap[index] //加强堆 this.nodeIndexMap[this.heap[index]], this.nodeIndexMap[this.heap[(index-1)/2]] = (index-1)/2, index index = (index - 1) / 2 } } //索引下沉,大根堆 func (this *TopK) HeapDown(index int) { left := 2*index + 1 for left <= this.heapSize-1 { //左孩子存在 //获取大孩子 largest := left if left+1 <= this.heapSize-1 && this.heap[left+1].Times > this.heap[left].Times { largest++ } //比较 if this.heap[index].Times < this.heap[largest].Times { //当前小于最大孩子,必须下沉 this.heap[index], this.heap[largest] = this.heap[largest], this.heap[index] //加强堆 this.nodeIndexMap[this.heap[index]], this.nodeIndexMap[this.heap[largest]] = largest, index } else { break } //下一次遍历 index = largest left = 2*index + 1 } } func (this *TopK) Push(node *Node) { this.heap[this.heapSize] = node //加强堆 this.nodeIndexMap[node] = this.heapSize this.heapSize++ //索引上移 this.HeapUp(this.heapSize) } func (this *TopK) Pop() *Node { ans := this.heap[0] this.heap[0], this.heap[this.heapSize-1] = this.heap[this.heapSize-1], this.heap[0] //加强堆 this.nodeIndexMap[this.heap[0]] = 0 this.nodeIndexMap[this.heap[this.heapSize-1]] = -1 this.heapSize-- //索引下沉 this.HeapDown(0) return ans }
执行结果如下: