How Can OpenFaaS Support Stateful Computation? ----- Implementing an Extension for Process-Variable Computation


Background

In our current business, an edge-computing algorithm is executed as a workflow, that is, a directed acyclic graph (DAG).

In the DAG, each node represents an algorithm plugin (evaluation, recognition, or merging), and each node needs the result of the previous algorithm as its input.

Because resources on the edge are constrained, resource usage has to be controlled, so our first candidate solution is to build on OpenFaaS. Before doing that, the DAG has to be converted into a singly linked list for processing. The work therefore breaks down into three steps:

  • Convert the DAG into a singly linked list
  • Implement a Kafka data source for OpenFaaS
  • Make OpenFaaS support stateful computation

Converting the DAG into a Singly Linked List

package main

import (
        "encoding/json"
        "errors"
        "fmt"
        "os"
        "strconv"
)

// Vertex 表示图中的一个节点
type Vertex struct {
        ID   int       // 节点的唯一标识
        Next *Vertex   // 单向链表的下一个节点
        Data *NodeData // 节点数据
}

// Edge 表示图中的一条边
type Edge struct {
        ID     int // 边的唯一标识
        From   int // 起点ID
        To     int // 终点ID
        Weight int // 边的权重
}

// Graph 表示一个有向无环图(DAG)
type Graph struct {
        Vertices map[int]*Vertex
        Edges    map[int]*Edge
        AdjList  map[int]map[int]*Edge
}

// NodeData 节点数据结构,用于存储从JSON解析出的节点信息
type NodeData struct {
        ID          string      `json:"id"`
        Label       string      `json:"label"`
        IconType    string      `json:"iconType"`
        ClassName   string      `json:"className"`
        Top         string      `json:"top"`
        Left        string      `json:"left"`
        FlinkParam  FlinkParam  `json:"flink_param"`
        Endpoints   []Endpoints `json:"endpoints"`
}

// FlinkParam Flink参数结构
type FlinkParam struct {
        Flink  Flink  `json:"flink"`
        Plugin Plugin `json:"plugin"`
        Type   string `json:"type"`
}

// Flink Flink详细参数结构
type Flink struct {
        Watermark          Watermark          `json:"watermark"`
        Param              map[string]string  `json:"param"`
        SubscribeOutputTags []string           `json:"subscribe_output_tags"`
        Parallelism        int                `json:"parallelism"`
        KeyClassName       string             `json:"key_class_name"`
        OutputTagsRule     []OutputTagsRule   `json:"output_tags_rule"`
        Type               string             `json:"type"`
        ClassName          string             `json:"class_name"`
}

// Watermark 水印参数结构
type Watermark struct {
        Type      string `json:"type"`
        TimeField string `json:"time_field"`
        Duration  int    `json:"duration"`
}

// OutputTagsRule 输出标签规则结构
type OutputTagsRule struct {
        Name string `json:"name"`
        Rule string `json:"rule"`
}

// Plugin 插件参数结构
type Plugin struct {
        Param     map[string]string `json:"param"`
        Type      string            `json:"type"`
        ClassName string            `json:"class_name"`
}

// Endpoints 连接点结构
type Endpoints struct {
        ID          string `json:"id"`
        Orientation []int  `json:"orientation"`
        Pos         []int  `json:"pos"`
}

// JSONData 存储从JSON解析出的节点数组
type JSONData struct {
        Nodes []NodeData `json:"nodes"`
}

// NewGraph 创建一个新的图
func NewGraph() *Graph {
        return &Graph{
                Vertices: make(map[int]*Vertex),
                Edges:    make(map[int]*Edge),
                AdjList:  make(map[int]map[int]*Edge),
        }
}

// AddVertex 添加一个节点
func (g *Graph) AddVertex(id int, data *NodeData) error {
        if _, exists := g.Vertices[id]; exists {
                return errors.New("vertex already exists")
        }
        g.Vertices[id] = &Vertex{ID: id, Data: data}
        g.AdjList[id] = make(map[int]*Edge)
        return nil
}

// AddEdge 添加一条边
func (g *Graph) AddEdge(id, from, to, weight int) error {
        if _, exists := g.Vertices[from]; !exists {
                return errors.New("source vertex does not exist")
        }
        if _, exists := g.Vertices[to]; !exists {
                return errors.New("target vertex does not exist")
        }
        if _, exists := g.Edges[id]; exists {
                return errors.New("edge already exists")
        }
        if g.hasCycle(from, to) {
                return errors.New("adding this edge would create a cycle")
        }
        edge := &Edge{
                ID:     id,
                From:   from,
                To:     to,
                Weight: weight,
        }
        g.Edges[id] = edge
        g.AdjList[from][to] = edge
        return nil
}

// hasCycle 检查添加 from->to 这条边是否会形成环:
// 如果在现有图中从 to 已经可以到达 from,加上这条边就会构成环
func (g *Graph) hasCycle(from, to int) bool {
        visited := make(map[int]bool)
        return g.reachable(to, from, visited)
}

// reachable 使用DFS判断从 start 是否可以到达 target
func (g *Graph) reachable(start, target int, visited map[int]bool) bool {
        if start == target {
                return true
        }
        if visited[start] {
                return false
        }
        visited[start] = true
        for neighbor := range g.AdjList[start] {
                if g.reachable(neighbor, target, visited) {
                        return true
                }
        }
        return false
}

// 解析JSON文件并构建图
func BuildGraphFromJSON(filePath string) (*Graph, error) {
        // 读取JSON文件
        data, err := os.ReadFile(filePath)
        if err != nil {
                return nil, err
        }

        // 解析JSON数据
        var jsonData JSONData
        err = json.Unmarshal(data, &jsonData)
        if err != nil {
                return nil, err
        }

        // 创建图
        graph := NewGraph()

        // 添加节点
        for i := range jsonData.Nodes {
                node := &jsonData.Nodes[i]
                id, err := strconv.Atoi(node.ID)
                if err != nil {
                        return nil, fmt.Errorf("invalid node id %q: %v", node.ID, err)
                }
                graph.AddVertex(id, node)
        }

        // 添加边(根据输出标签和输入标签)
        for _, fromNode := range jsonData.Nodes {
                fromID, err := strconv.Atoi(fromNode.ID)
                if err != nil {
                        return nil, fmt.Errorf("invalid node id %q: %v", fromNode.ID, err)
                }
                for _, toNode := range jsonData.Nodes {
                        toID, err := strconv.Atoi(toNode.ID)
                        if err != nil {
                                return nil, fmt.Errorf("invalid node id %q: %v", toNode.ID, err)
                        }
                        for _, outputTag := range fromNode.FlinkParam.Flink.OutputTagsRule {
                                for _, inputTag := range toNode.FlinkParam.Flink.SubscribeOutputTags {
                                        if outputTag.Name == inputTag {
                                                graph.AddEdge(len(graph.Edges)+1, fromID, toID, 1)
                                        }
                                }
                        }
                }
        }

        return graph, nil
}

// TopologicalSort 返回图的拓扑排序
func (g *Graph) TopologicalSort() ([]int, error) {
        // 计算每个节点的入度
        inDegree := make(map[int]int)
        for id := range g.Vertices {
                inDegree[id] = 0
        }
        for _, edge := range g.Edges {
                inDegree[edge.To]++
        }

        // 找到入度为 0 的节点(起点)
        queue := make([]int, 0)
        for id, degree := range inDegree {
                if degree == 0 {
                        queue = append(queue, id)
                }
        }

        // 拓扑排序
        result := make([]int, 0)
        for len(queue) > 0 {
                current := queue[0]
                queue = queue[1:]
                result = append(result, current)

                // 减少邻居节点的入度
                for neighbor := range g.AdjList[current] {
                        inDegree[neighbor]--
                        if inDegree[neighbor] == 0 {
                                queue = append(queue, neighbor)
                        }
                }
        }

        // 检查是否有环
        if len(result) != len(g.Vertices) {
                return nil, errors.New("graph has a cycle")
        }

        return result, nil
}
// ToLinkedList 将图转换为单向链表
func (g *Graph) ToLinkedList() (*Vertex, error) {
        // 获取拓扑排序
        sorted, err := g.TopologicalSort()
        if err != nil {
                return nil, err
        }

        // 按拓扑顺序把节点串成单向链表,返回头节点
        var head, tail *Vertex
        for _, id := range sorted {
                v := g.Vertices[id]
                v.Next = nil
                if head == nil {
                        head = v
                } else {
                        tail.Next = v
                }
                tail = v
        }
        return head, nil
}
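
To make the conversion concrete, here is a minimal driver sketch (not part of the original code): it assumes it sits in the same main package as the functions above and that a workflow definition exists at workflow.json with numeric node IDs; the file name is purely illustrative.

// Example driver for the code above (add "log" to the import block).
func main() {
        // Build the DAG from the workflow definition (the path is an assumption)
        graph, err := BuildGraphFromJSON("workflow.json")
        if err != nil {
                log.Fatalf("failed to build graph: %v", err)
        }

        // Flatten the DAG into a singly linked list via topological order
        head, err := graph.ToLinkedList()
        if err != nil {
                log.Fatalf("failed to convert DAG: %v", err)
        }

        // Walk the list: this is the order in which the algorithm plugins would run
        for node := head; node != nil; node = node.Next {
                fmt.Printf("node %d (%s) -> ", node.ID, node.Data.Label)
        }
        fmt.Println("end")
}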

Implementing a Kafka Data Source for OpenFaaS

Based on OpenFaaS's existing data-source architecture, we create a Kafka data-source implementation that follows the same interface pattern as the existing sources.

OpenFaaS's Existing Architecture

sequenceDiagram
    participant Client as "Client"
    participant Gateway as "OpenFaaS Gateway"
    participant Provider as "Function Provider"
    participant Function as "Function instance"
    participant NATS as "NATS queue"
    participant Prometheus as "Prometheus"
    participant UI as "Web UI"

    Note over Client,UI: 1. Synchronous function invocation
    Client->>Gateway: HTTP request /function/{name}
    Gateway->>Provider: Forward the request to the Provider
    Provider->>Function: Route to a function instance
    Function->>Provider: Return the response
    Provider->>Gateway: Return the response
    Gateway->>Client: Return the final response
    Gateway->>Prometheus: Emit metrics

    Note over Client,UI: 2. Asynchronous function invocation
    Client->>Gateway: HTTP request /async-function/{name}
    Gateway->>NATS: Enqueue the request
    Gateway->>Client: Return 202 Accepted
    NATS->>Provider: Queue worker processes the queued request
    Provider->>Function: Execute the function
    Function->>Provider: Return the result
    opt Callback URL provided
        Provider->>Client: Deliver the result via callback
    end

    Note over Client,UI: 3. Function management
    Client->>Gateway: Deploy function POST /system/functions
    Gateway->>Provider: Forward the deployment request
    Provider->>Provider: Create the function instance
    Provider->>Gateway: Return deployment status
    Gateway->>Client: Return the result

    Note over Client,UI: 4. Monitoring and autoscaling
    Gateway->>Prometheus: Continuously emit metrics
    Provider->>Prometheus: Emit function metrics
    Gateway->>Provider: Scale up or down based on load
    Provider->>Function: Create/destroy instances

    Note over Client,UI: 5. Web UI interaction
    UI->>Gateway: List functions /system/functions
    Gateway->>Provider: Forward the request
    Provider->>Gateway: Return function info
    Gateway->>UI: Return the function list
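
As a quick illustration of the two invocation paths in the diagram, the sketch below calls a deployed function both synchronously and asynchronously over HTTP. The gateway address and the function name echo are placeholders.

package main

import (
    "bytes"
    "fmt"
    "log"
    "net/http"
)

func main() {
    gateway := "http://127.0.0.1:8080" // assumed gateway address
    payload := []byte(`{"message": "hello"}`)

    // Synchronous path: the response body is the function's output
    resp, err := http.Post(gateway+"/function/echo", "application/json", bytes.NewReader(payload))
    if err != nil {
        log.Fatal(err)
    }
    resp.Body.Close()
    fmt.Println("sync status:", resp.StatusCode) // typically 200

    // Asynchronous path: the request is queued via NATS and 202 is returned immediately
    resp, err = http.Post(gateway+"/async-function/echo", "application/json", bytes.NewReader(payload))
    if err != nil {
        log.Fatal(err)
    }
    resp.Body.Close()
    fmt.Println("async status:", resp.StatusCode) // typically 202 Accepted
}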

1. Kafka Data Source Interface Definition

First, we define the Kafka data source's types, similar to the existing service_query.go:8-12:

// gateway/kafka/kafka_source.go  
package kafka  
  
import (  
    "context"  
    "log"  
  
    "github.com/Shopify/sarama"  
)  
  
// KafkaDataSource implements a Kafka-based data source for OpenFaaS  
type KafkaDataSource struct {  
    consumer    sarama.Consumer  
    brokers     []string  
    topics      []string  
    groupID     string  
    credentials *KafkaCredentials  
}  
  
type KafkaCredentials struct {  
    Username string  
    Password string  
    UseSASL  bool  
}  
  
type KafkaSourceConfig struct {  
    Brokers     []string  
    Topics      []string  
    GroupID     string  
    Credentials *KafkaCredentials  
}

2. Core Kafka Data Source Implementation

// NewKafkaDataSource creates a new Kafka data source  
func NewKafkaDataSource(config KafkaSourceConfig) (*KafkaDataSource, error) {  
    saramaConfig := sarama.NewConfig()  
    saramaConfig.Consumer.Group.Rebalance.Strategy = sarama.BalanceStrategyRoundRobin  
    saramaConfig.Consumer.Offsets.Initial = sarama.OffsetNewest  
      
    if config.Credentials != nil && config.Credentials.UseSASL {  
        saramaConfig.Net.SASL.Enable = true  
        saramaConfig.Net.SASL.User = config.Credentials.Username  
        saramaConfig.Net.SASL.Password = config.Credentials.Password  
        saramaConfig.Net.SASL.Mechanism = sarama.SASLTypePlaintext  
    }  
      
    consumer, err := sarama.NewConsumer(config.Brokers, saramaConfig)  
    if err != nil {  
        return nil, err  
    }  
      
    return &KafkaDataSource{  
        consumer:    consumer,  
        brokers:     config.Brokers,  
        topics:      config.Topics,  
        groupID:     config.GroupID,  
        credentials: config.Credentials,  
    }, nil  
}  
  
// StartMessageConsumer starts consuming messages from Kafka topics  
func (k *KafkaDataSource) StartMessageConsumer(ctx context.Context, messageHandler func([]byte) error) error {  
    for _, topic := range k.topics {  
        partitions, err := k.consumer.Partitions(topic)  
        if err != nil {  
            log.Printf("Error getting partitions for topic %s: %v", topic, err)  
            continue  
        }  
          
        for _, partition := range partitions {  
            go k.consumePartition(ctx, topic, partition, messageHandler)  
        }  
    }  
      
    return nil  
}  
  
func (k *KafkaDataSource) consumePartition(ctx context.Context, topic string, partition int32, messageHandler func([]byte) error) {  
    partitionConsumer, err := k.consumer.ConsumePartition(topic, partition, sarama.OffsetNewest)  
    if err != nil {  
        log.Printf("Error creating partition consumer: %v", err)  
        return  
    }  
    defer partitionConsumer.Close()  
      
    for {  
        select {  
        case message := <-partitionConsumer.Messages():  
            if message != nil {  
                if err := messageHandler(message.Value); err != nil {  
                    log.Printf("Error handling message: %v", err)  
                }  
            }  
        case err := <-partitionConsumer.Errors():  
            log.Printf("Kafka consumer error: %v", err)  
        case <-ctx.Done():  
            return  
        }  
    }  
}  
  
// Close closes the Kafka consumer  
func (k *KafkaDataSource) Close() error {  
    return k.consumer.Close()  
}
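
A minimal usage sketch of the consumer above (my own example, in the same package): it assumes a broker at localhost:9092 and a topic named openfaas-triggers, both placeholders, and simply logs every message until the caller cancels the context.

// Example (not part of the design above): consume from an assumed local broker.
func runKafkaSourceExample(ctx context.Context) error {
    source, err := NewKafkaDataSource(KafkaSourceConfig{
        Brokers: []string{"localhost:9092"},    // assumed broker address
        Topics:  []string{"openfaas-triggers"}, // assumed topic
        GroupID: "openfaas-gateway",
    })
    if err != nil {
        return err
    }
    defer source.Close()

    // Every consumed message is handed to this callback
    if err := source.StartMessageConsumer(ctx, func(msg []byte) error {
        log.Printf("received: %s", string(msg))
        return nil
    }); err != nil {
        return err
    }

    <-ctx.Done() // the partition consumers keep running until the caller cancels
    return nil
}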

3. Integrating with the OpenFaaS Gateway

Now we need to integrate the Kafka data source into OpenFaaS's main architecture, similar to the existing main.go:106-119:

// gateway/kafka/kafka_integration.go  
package kafka  
  
import (  
    "context"  
    "encoding/json"  
    "log"  
)  
  
// KafkaFunctionTrigger handles Kafka messages and triggers function invocations  
type KafkaFunctionTrigger struct {  
    kafkaSource     *KafkaDataSource  
    functionInvoker FunctionInvoker  
    ctx             context.Context  
    cancel          context.CancelFunc  
}  
  
type FunctionInvoker interface {  
    InvokeFunction(functionName string, namespace string, body []byte) error  
}  
  
type KafkaMessage struct {  
    FunctionName string          `json:"function_name"`  
    Namespace    string          `json:"namespace"`  
    Payload      json.RawMessage `json:"payload"`  
    Headers      map[string]string `json:"headers,omitempty"`  
}  
  
// NewKafkaFunctionTrigger creates a new Kafka function trigger  
func NewKafkaFunctionTrigger(config KafkaSourceConfig, invoker FunctionInvoker) (*KafkaFunctionTrigger, error) {  
    kafkaSource, err := NewKafkaDataSource(config)  
    if err != nil {  
        return nil, err  
    }  
      
    ctx, cancel := context.WithCancel(context.Background())  
      
    trigger := &KafkaFunctionTrigger{  
        kafkaSource:     kafkaSource,  
        functionInvoker: invoker,  
        ctx:             ctx,  
        cancel:          cancel,  
    }  
      
    return trigger, nil  
}  
  
// Start begins consuming Kafka messages and triggering functions  
func (k *KafkaFunctionTrigger) Start() error {  
    return k.kafkaSource.StartMessageConsumer(k.ctx, k.handleMessage)  
}  
  
func (k *KafkaFunctionTrigger) handleMessage(messageBytes []byte) error {  
    var kafkaMsg KafkaMessage  
    if err := json.Unmarshal(messageBytes, &kafkaMsg); err != nil {  
        log.Printf("Error unmarshaling Kafka message: %v", err)  
        return err  
    }  
      
    // Invoke the specified function  
    return k.functionInvoker.InvokeFunction(kafkaMsg.FunctionName, kafkaMsg.Namespace, kafkaMsg.Payload)  
}  
  
// Stop stops the Kafka consumer  
func (k *KafkaFunctionTrigger) Stop() error {  
    k.cancel()  
    return k.kafkaSource.Close()  
}
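
For local testing, the FunctionInvoker interface makes it easy to substitute a stub for the real gateway. The sketch below (my own, same package) wires a logging stand-in into the trigger; the broker and topic names are placeholders.

// logInvoker is a stand-in FunctionInvoker that only logs what would be invoked.
type logInvoker struct{}

func (l *logInvoker) InvokeFunction(functionName string, namespace string, body []byte) error {
    log.Printf("would invoke %s.%s with %d bytes", functionName, namespace, len(body))
    return nil
}

func runTriggerExample() (*KafkaFunctionTrigger, error) {
    cfg := KafkaSourceConfig{
        Brokers: []string{"localhost:9092"},    // assumed broker
        Topics:  []string{"openfaas-triggers"}, // assumed topic
        GroupID: "openfaas-gateway",
    }

    trigger, err := NewKafkaFunctionTrigger(cfg, &logInvoker{})
    if err != nil {
        return nil, err
    }

    // Start returns once the partition consumers are launched;
    // call trigger.Stop() when shutting down.
    if err := trigger.Start(); err != nil {
        return nil, err
    }
    return trigger, nil
}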

4. HTTP Function Invoker Implementation

// gateway/kafka/function_invoker.go  
package kafka  
  
import (  
    "bytes"  
    "fmt"  
    "io"  
    "net/http"  
    "time"  
)  
  
// HTTPFunctionInvoker implements FunctionInvoker using HTTP calls  
type HTTPFunctionInvoker struct {  
    gatewayURL string  
    client     *http.Client  
    timeout    time.Duration  
}  
  
// NewHTTPFunctionInvoker creates a new HTTP-based function invoker  
func NewHTTPFunctionInvoker(gatewayURL string, timeout time.Duration) *HTTPFunctionInvoker {  
    return &HTTPFunctionInvoker{  
        gatewayURL: gatewayURL,  
        client: &http.Client{  
            Timeout: timeout,  
        },  
        timeout: timeout,  
    }  
}  
  
// InvokeFunction invokes a function via HTTP  
func (h *HTTPFunctionInvoker) InvokeFunction(functionName string, namespace string, body []byte) error {  
    var url string  
    if namespace != "" {  
        url = fmt.Sprintf("%s/function/%s.%s", h.gatewayURL, functionName, namespace)  
    } else {  
        url = fmt.Sprintf("%s/function/%s", h.gatewayURL, functionName)  
    }  
      
    req, err := http.NewRequest("POST", url, bytes.NewReader(body))  
    if err != nil {  
        return fmt.Errorf("error creating request: %v", err)  
    }  
      
    req.Header.Set("Content-Type", "application/json")  
    req.Header.Set("X-Kafka-Trigger", "true")  
      
    resp, err := h.client.Do(req)  
    if err != nil {  
        return fmt.Errorf("error invoking function: %v", err)  
    }  
    defer resp.Body.Close()  
      
    if resp.StatusCode >= 400 {  
        body, _ := io.ReadAll(resp.Body)  
        return fmt.Errorf("function invocation failed with status %d: %s", resp.StatusCode, string(body))  
    }  
      
    return nil  
}

5. Integrating with the Main Gateway

Finally, we modify main.go:70-73 to include the Kafka data source:

// 在 gateway/main.go 中添加  
import (  
    "github.com/openfaas/faas/gateway/kafka"  
)  
  
// 在main函数中添加Kafka配置和启动逻辑  
func main() {  
    // ... 现有代码 ...  
      
    // Kafka配置  
    if config.UseKafka() {  
        log.Println("Kafka trigger enabled")  
          
        kafkaConfig := kafka.KafkaSourceConfig{  
            Brokers: config.KafkaBrokers,  
            Topics:  config.KafkaTopics,  
            GroupID: config.KafkaGroupID,  
            Credentials: &kafka.KafkaCredentials{  
                Username: config.KafkaUsername,  
                Password: config.KafkaPassword,  
                UseSASL:  config.KafkaUseSASL,  
            },  
        }  
          
        functionInvoker := kafka.NewHTTPFunctionInvoker(  
            config.FunctionsProviderURL.String(),  
            config.UpstreamTimeout,  
        )  
          
        kafkaTrigger, err := kafka.NewKafkaFunctionTrigger(kafkaConfig, functionInvoker)  
        if err != nil {  
            log.Fatalf("Failed to create Kafka trigger: %v", err)  
        }  
          
        go func() {  
            if err := kafkaTrigger.Start(); err != nil {  
                log.Printf("Kafka trigger error: %v", err)  
            }  
        }()  
          
        // 优雅关闭  
        defer kafkaTrigger.Stop()  
    }  
      
    // ... 现有代码继续 ...  
}
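
Note that config.UseKafka() and the Kafka* fields referenced above are not part of the upstream OpenFaaS gateway configuration; they are assumptions of this article. A minimal sketch (my own) of how such settings could be read from the environment variables listed in the usage example below:

// kafka_settings.go - illustrative helper, not part of upstream OpenFaaS
package config

import (
    "os"
    "strings"
)

// KafkaSettings mirrors the config fields assumed in the main.go snippet above.
type KafkaSettings struct {
    Brokers  []string
    Topics   []string
    GroupID  string
    Username string
    Password string
    UseSASL  bool
}

// LoadKafkaSettings reads the Kafka-related environment variables.
func LoadKafkaSettings() KafkaSettings {
    return KafkaSettings{
        Brokers:  strings.Split(os.Getenv("kafka_brokers"), ","),
        Topics:   strings.Split(os.Getenv("kafka_topics"), ","),
        GroupID:  os.Getenv("kafka_group_id"),
        Username: os.Getenv("kafka_username"),
        Password: os.Getenv("kafka_password"),
        UseSASL:  os.Getenv("kafka_use_sasl") == "true",
    }
}

// UseKafka reports whether the Kafka trigger should be enabled.
func (s KafkaSettings) UseKafka() bool {
    return len(s.Brokers) > 0 && s.Brokers[0] != ""
}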

Usage Example

To use this Kafka data source, you need to:

1. Configure environment variables:

export kafka_brokers="localhost:9092"  
export kafka_topics="openfaas-triggers"  
export kafka_group_id="openfaas-gateway"  
export kafka_username="your-username"  
export kafka_password="your-password"  
export kafka_use_sasl="true"

2. Send a Kafka message:

{  
  "function_name": "echo",  
  "namespace": "openfaas-fn",  
  "payload": {"message": "Hello from Kafka!"},  
  "headers": {"content-type": "application/json"}  
}

This implementation follows OpenFaaS's existing architectural patterns, referencing the service-monitoring pattern in exporter.go:80-122 and the asynchronous processing pattern in queue_proxy.go:24-68. It provides a Kafka data source that plugs into the OpenFaaS Gateway and lets Kafka messages trigger function execution.

Making OpenFaaS Support Stateful Computation

We implement an intermediate-variable storage system for Python computation tasks on top of OpenFaaS's existing NATS KeyValue storage interface and plugin architecture.

Prerequisites

OpenFaaS already ships with NATS, and NATS JetStream's KeyValue store is the storage interface we build on here.

NATS Storage Types

● 1. File storage (FileStorage)

This is NATS JetStream's default storage type; data is persisted to disk. Use it when durability matters.

● 2. Memory storage (MemoryStorage)

Data lives only in memory, giving faster access but no persistence.

The KeyValue store can be customized through its configuration; see kv.go:250-268.
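
As a hedged illustration of what that configuration covers (field names follow the nats.go client; the bucket name and values below are placeholders):

// Illustrative sketch: create a KeyValue bucket with explicit storage settings.
package storage

import (
    "time"

    "github.com/nats-io/nats.go"
)

// newExampleBucket shows the main KeyValueConfig knobs.
func newExampleBucket(nc *nats.Conn) (nats.KeyValue, error) {
    js, err := nc.JetStream()
    if err != nil {
        return nil, err
    }
    return js.CreateKeyValue(&nats.KeyValueConfig{
        Bucket:      "example-bucket",     // placeholder bucket name
        Description: "intermediate state", // free-form description
        TTL:         time.Hour,            // entries expire after one hour
        History:     5,                    // keep up to 5 revisions per key
        Storage:     nats.FileStorage,     // or nats.MemoryStorage for non-durable data
        Replicas:    1,                    // replica count of the backing stream
    })
}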

1. Storage Manager Implementation

First, based on the existing NATS KeyValue interface, we implement a storage manager for Python tasks:

// gateway/storage/task_storage.go  
package storage  
  
import (  
    "encoding/json"  
    "fmt"  
    "time"  
  
    "github.com/nats-io/nats.go"  
)  
  
// PythonTaskStorage manages intermediate variables for Python computation tasks  
type PythonTaskStorage struct {  
    kv          nats.KeyValue  
    js          nats.JetStreamContext  
    taskBucket  string  
    varBucket   string  
}  
  
type TaskVariable struct {  
    TaskID      string      `json:"task_id"`  
    VarName     string      `json:"var_name"`  
    VarType     string      `json:"var_type"`  
    Value       interface{} `json:"value"`  
    Timestamp   time.Time   `json:"timestamp"`  
    TTL         time.Duration `json:"ttl,omitempty"`  
}  
  
type TaskMetadata struct {  
    TaskID      string            `json:"task_id"`  
    FunctionName string           `json:"function_name"`  
    Status      string            `json:"status"`  
    Variables   map[string]string `json:"variables"` // var_name -> key mapping  
    CreatedAt   time.Time         `json:"created_at"`  
    UpdatedAt   time.Time         `json:"updated_at"`  
}  
  
// NewPythonTaskStorage creates a new task storage manager  
func NewPythonTaskStorage(nc *nats.Conn) (*PythonTaskStorage, error) {  
    js, err := nc.JetStream()  
    if err != nil {  
        return nil, fmt.Errorf("failed to get JetStream context: %v", err)  
    }  
      
    // Create task metadata bucket  
    taskKV, err := js.CreateKeyValue(&nats.KeyValueConfig{  
        Bucket:      "python-tasks",  
        Description: "Python task metadata storage",  
        TTL:         24 * time.Hour,  
        History:     5,  
        Storage:     nats.FileStorage,  
    })  
    if err != nil {  
        // Try to get existing bucket  
        taskKV, err = js.KeyValue("python-tasks")  
        if err != nil {  
            return nil, fmt.Errorf("failed to create/get task bucket: %v", err)  
        }  
    }  
      
    // Create variables bucket (the handle is re-fetched from JetStream on each use)  
    if _, err := js.CreateKeyValue(&nats.KeyValueConfig{  
        Bucket:      "python-variables",  
        Description: "Python task variables storage",  
        TTL:         24 * time.Hour,  
        History:     10,  
        Storage:     nats.FileStorage,  
    }); err != nil {  
        // Fall back to the existing bucket  
        if _, err := js.KeyValue("python-variables"); err != nil {  
            return nil, fmt.Errorf("failed to create/get variables bucket: %v", err)  
        }  
    }  
      
    return &PythonTaskStorage{  
        kv:         taskKV,  
        js:         js,  
        taskBucket: "python-tasks",  
        varBucket:  "python-variables",  
    }, nil  
}  
  
// CreateTask creates a new Python computation task  
func (pts *PythonTaskStorage) CreateTask(taskID, functionName string) error {  
    metadata := TaskMetadata{  
        TaskID:       taskID,  
        FunctionName: functionName,  
        Status:       "created",  
        Variables:    make(map[string]string),  
        CreatedAt:    time.Now(),  
        UpdatedAt:    time.Now(),  
    }  
      
    data, err := json.Marshal(metadata)  
    if err != nil {  
        return fmt.Errorf("failed to marshal task metadata: %v", err)  
    }  
      
    _, err = pts.kv.Put(taskID, data)  
    return err  
}  
  
// StoreVariable stores an intermediate variable for a task  
func (pts *PythonTaskStorage) StoreVariable(taskID, varName string, value interface{}, varType string, ttl time.Duration) error {  
    variable := TaskVariable{  
        TaskID:    taskID,  
        VarName:   varName,  
        VarType:   varType,  
        Value:     value,  
        Timestamp: time.Now(),  
        TTL:       ttl,  
    }  
      
    data, err := json.Marshal(variable)  
    if err != nil {  
        return fmt.Errorf("failed to marshal variable: %v", err)  
    }  
      
    varKey := fmt.Sprintf("%s:%s", taskID, varName)  
      
    // Store in variables bucket  
    varKV, err := pts.js.KeyValue(pts.varBucket)  
    if err != nil {  
        return fmt.Errorf("failed to get variables bucket: %v", err)  
    }  
      
    _, err = varKV.Put(varKey, data)  
    if err != nil {  
        return fmt.Errorf("failed to store variable: %v", err)  
    }  
      
    // Update task metadata  
    return pts.updateTaskVariables(taskID, varName, varKey)  
}  
  
// GetVariable retrieves an intermediate variable  
func (pts *PythonTaskStorage) GetVariable(taskID, varName string) (*TaskVariable, error) {  
    varKey := fmt.Sprintf("%s:%s", taskID, varName)  
      
    varKV, err := pts.js.KeyValue(pts.varBucket)  
    if err != nil {  
        return nil, fmt.Errorf("failed to get variables bucket: %v", err)  
    }  
      
    entry, err := varKV.Get(varKey)  
    if err != nil {  
        return nil, fmt.Errorf("failed to get variable: %v", err)  
    }  
      
    var variable TaskVariable  
    if err := json.Unmarshal(entry.Value(), &variable); err != nil {  
        return nil, fmt.Errorf("failed to unmarshal variable: %v", err)  
    }  
      
    return &variable, nil  
}  
  
func (pts *PythonTaskStorage) updateTaskVariables(taskID, varName, varKey string) error {  
    entry, err := pts.kv.Get(taskID)  
    if err != nil {  
        return fmt.Errorf("failed to get task metadata: %v", err)  
    }  
      
    var metadata TaskMetadata  
    if err := json.Unmarshal(entry.Value(), &metadata); err != nil {  
        return fmt.Errorf("failed to unmarshal task metadata: %v", err)  
    }  
      
    metadata.Variables[varName] = varKey  
    metadata.UpdatedAt = time.Now()  
      
    data, err := json.Marshal(metadata)  
    if err != nil {  
        return fmt.Errorf("failed to marshal updated metadata: %v", err)  
    }  
      
    _, err = pts.kv.Update(taskID, data, entry.Revision())  
    return err  
}
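
A minimal usage sketch of the storage manager (my own example, same package): it assumes a NATS server with JetStream enabled at nats://127.0.0.1:4222, creates a task, stores a variable, and reads it back.

// Example round trip against the storage manager defined above.
func exampleTaskStorage() error {
    nc, err := nats.Connect("nats://127.0.0.1:4222") // assumed NATS address
    if err != nil {
        return err
    }
    defer nc.Close()

    store, err := NewPythonTaskStorage(nc)
    if err != nil {
        return err
    }

    // Register the task, then persist and read back an intermediate variable
    if err := store.CreateTask("task-123", "data-processor"); err != nil {
        return err
    }
    if err := store.StoreVariable("task-123", "row_count", 42, "int", time.Hour); err != nil {
        return err
    }

    v, err := store.GetVariable("task-123", "row_count")
    if err != nil {
        return err
    }
    fmt.Printf("got %s = %v (%s)\n", v.VarName, v.Value, v.VarType)
    return nil
}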

2. HTTP API Handler

Based on the existing external-service query architecture, we implement the storage API handler:

// gateway/storage/storage_handler.go  
package storage  
  
import (  
    "encoding/json"  
    "fmt"  
    "net/http"  
    "time"  
      
    "github.com/gorilla/mux"  
    "github.com/openfaas/faas/gateway/middleware"  
)  
  
// StorageHandler handles HTTP requests for task storage  
type StorageHandler struct {  
    storage      *PythonTaskStorage  
    authInjector middleware.AuthInjector  
}  
  
// NewStorageHandler creates a new storage handler  
func NewStorageHandler(storage *PythonTaskStorage, authInjector middleware.AuthInjector) *StorageHandler {  
    return &StorageHandler{  
        storage:      storage,  
        authInjector: authInjector,  
    }  
}  
  
// StoreVariableRequest represents a variable storage request  
type StoreVariableRequest struct {  
    TaskID   string      `json:"task_id"`  
    VarName  string      `json:"var_name"`  
    VarType  string      `json:"var_type"`  
    Value    interface{} `json:"value"`  
    TTL      string      `json:"ttl,omitempty"`  
}  
  
// GetVariableResponse represents a variable retrieval response  
type GetVariableResponse struct {  
    TaskID    string      `json:"task_id"`  
    VarName   string      `json:"var_name"`  
    VarType   string      `json:"var_type"`  
    Value     interface{} `json:"value"`  
    Timestamp time.Time   `json:"timestamp"`  
}  
  
// HandleStoreVariable handles variable storage requests  
func (sh *StorageHandler) HandleStoreVariable(w http.ResponseWriter, r *http.Request) {  
    if r.Method != http.MethodPost {  
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)  
        return  
    }  
      
    var req StoreVariableRequest  
    if err := json.NewDecoder(r.Body).Decode(&req); err != nil {  
        http.Error(w, fmt.Sprintf("Invalid request body: %v", err), http.StatusBadRequest)  
        return  
    }  
      
    // Parse TTL if provided  
    var ttl time.Duration  
    if req.TTL != "" {  
        var err error  
        ttl, err = time.ParseDuration(req.TTL)  
        if err != nil {  
            http.Error(w, fmt.Sprintf("Invalid TTL format: %v", err), http.StatusBadRequest)  
            return  
        }  
    }  
      
    // Store the variable  
    if err := sh.storage.StoreVariable(req.TaskID, req.VarName, req.Value, req.VarType, ttl); err != nil {  
        http.Error(w, fmt.Sprintf("Failed to store variable: %v", err), http.StatusInternalServerError)  
        return  
    }  
      
    w.WriteHeader(http.StatusCreated)  
    json.NewEncoder(w).Encode(map[string]string{"status": "stored"})  
}  
  
// HandleGetVariable handles variable retrieval requests  
func (sh *StorageHandler) HandleGetVariable(w http.ResponseWriter, r *http.Request) {  
    if r.Method != http.MethodGet {  
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)  
        return  
    }  
      
    vars := mux.Vars(r)  
    taskID := vars["taskId"]  
    varName := vars["varName"]  
      
    if taskID == "" || varName == "" {  
        http.Error(w, "Missing taskId or varName", http.StatusBadRequest)  
        return  
    }  
      
    variable, err := sh.storage.GetVariable(taskID, varName)  
    if err != nil {  
        http.Error(w, fmt.Sprintf("Failed to get variable: %v", err), http.StatusNotFound)  
        return  
    }  
      
    response := GetVariableResponse{  
        TaskID:    variable.TaskID,  
        VarName:   variable.VarName,  
        VarType:   variable.VarType,  
        Value:     variable.Value,  
        Timestamp: variable.Timestamp,  
    }  
      
    w.Header().Set("Content-Type", "application/json")  
    json.NewEncoder(w).Encode(response)  
}  
  
// HandleCreateTask handles task creation requests  
func (sh *StorageHandler) HandleCreateTask(w http.ResponseWriter, r *http.Request) {  
    if r.Method != http.MethodPost {  
        http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)  
        return  
    }  
      
    var req struct {  
        TaskID       string `json:"task_id"`  
        FunctionName string `json:"function_name"`  
    }  
      
    if err := json.NewDecoder(r.Body).Decode(&req); err != nil {  
        http.Error(w, fmt.Sprintf("Invalid request body: %v", err), http.StatusBadRequest)  
        return  
    }  
      
    if err := sh.storage.CreateTask(req.TaskID, req.FunctionName); err != nil {  
        http.Error(w, fmt.Sprintf("Failed to create task: %v", err), http.StatusInternalServerError)  
        return  
    }  
      
    w.WriteHeader(http.StatusCreated)  
    json.NewEncoder(w).Encode(map[string]string{"status": "created"})  
}

3. Integrating with the Main Gateway

Based on the existing caching and query architecture, the storage system is wired into the main Gateway:

// 在 gateway/main.go 中添加存储系统初始化  
import (  
    "github.com/openfaas/faas/gateway/storage"  
    "github.com/nats-io/nats.go"  
)  
  
func main() {  
    // ... 现有代码 ...  
      
    // 初始化NATS连接  
    natsURL := os.Getenv("nats_url")  
    if natsURL == "" {  
        natsURL = "nats://localhost:4222"  
    }  
      
    nc, err := nats.Connect(natsURL)  
    if err != nil {  
        log.Fatalf("Failed to connect to NATS: %v", err)  
    }  
    defer nc.Close()  
      
    // 初始化Python任务存储  
    taskStorage, err := storage.NewPythonTaskStorage(nc)  
    if err != nil {  
        log.Fatalf("Failed to initialize task storage: %v", err)  
    }  
      
    // 创建存储处理器  
    storageHandler := storage.NewStorageHandler(taskStorage, serviceAuthInjector)  
      
    // 添加存储API路由  
    r.HandleFunc("/system/storage/tasks", storageHandler.HandleCreateTask)  
    r.HandleFunc("/system/storage/variables", storageHandler.HandleStoreVariable)  
    r.HandleFunc("/system/storage/variables/{taskId}/{varName}", storageHandler.HandleGetVariable)  
      
    // ... 现有代码继续 ...  
}

4. Python Client Library

# python_client/openfaas_storage.py  
import json  
import requests  
import os  
import pickle  
import base64  
from typing import Any, Optional, Dict, Union  
from datetime import datetime  
import logging  
  
logger = logging.getLogger(__name__)  
  
class OpenFaaSStorage:  
    """OpenFaaS存储客户端,用于Python函数中管理中间变量"""  
      
    def __init__(self, gateway_url: Optional[str] = None, task_id: Optional[str] = None):  
        self.gateway_url = gateway_url or os.getenv('OPENFAAS_GATEWAY_URL', 'http://gateway:8080')  
        self.task_id = task_id or os.getenv('TASK_ID')  
        if not self.task_id:  
            raise ValueError("TASK_ID environment variable is required")  
          
        # 设置HTTP会话  
        self.session = requests.Session()  
        self.session.headers.update({  
            'Content-Type': 'application/json',  
            'User-Agent': 'OpenFaaS-Python-Storage-Client/1.0'  
        })  
          
        # 如果有认证信息,添加到会话中  
        auth_user = os.getenv('OPENFAAS_AUTH_USER')  
        auth_pass = os.getenv('OPENFAAS_AUTH_PASS')  
        if auth_user and auth_pass:  
            self.session.auth = (auth_user, auth_pass)  
      
    def create_task(self, function_name: str) -> bool:  
        """创建新的计算任务"""  
        url = f"{self.gateway_url}/system/storage/tasks"  
        payload = {  
            "task_id": self.task_id,  
            "function_name": function_name  
        }  
          
        try:  
            response = self.session.post(url, json=payload, timeout=10)  
            response.raise_for_status()  
            logger.info(f"Task {self.task_id} created successfully")  
            return True  
        except requests.RequestException as e:  
            logger.error(f"Failed to create task: {e}")  
            return False  
      
    def store_variable(self, var_name: str, value: Any, var_type: Optional[str] = None,   
                      ttl: Optional[str] = None, serialize: bool = True) -> bool:  
        """存储中间变量到外部存储"""  
        url = f"{self.gateway_url}/system/storage/variables"  
          
        # 自动检测变量类型  
        if var_type is None:  
            var_type = type(value).__name__  
          
        # 序列化复杂对象  
        if serialize and not isinstance(value, (str, int, float, bool, type(None))):  
            try:  
                # 使用pickle序列化,然后base64编码  
                serialized = pickle.dumps(value)  
                encoded_value = base64.b64encode(serialized).decode('utf-8')  
                var_type = f"pickled_{var_type}"  
                value = encoded_value  
            except Exception as e:  
                logger.error(f"Failed to serialize variable {var_name}: {e}")  
                return False  
          
        payload = {  
            "task_id": self.task_id,  
            "var_name": var_name,  
            "var_type": var_type,  
            "value": value  
        }  
          
        if ttl:  
            payload["ttl"] = ttl  
          
        try:  
            response = self.session.post(url, json=payload, timeout=10)  
            response.raise_for_status()  
            logger.info(f"Variable {var_name} stored successfully")  
            return True  
        except requests.RequestException as e:  
            logger.error(f"Failed to store variable {var_name}: {e}")  
            return False  
      
    def get_variable(self, var_name: str, deserialize: bool = True) -> Optional[Any]:  
        """从外部存储获取中间变量"""  
        url = f"{self.gateway_url}/system/storage/variables/{self.task_id}/{var_name}"  
          
        try:  
            response = self.session.get(url, timeout=10)  
            response.raise_for_status()  
              
            data = response.json()  
            value = data.get('value')  
            var_type = data.get('var_type', '')  
              
            # 反序列化pickled对象  
            if deserialize and var_type.startswith('pickled_'):  
                try:  
                    decoded = base64.b64decode(value.encode('utf-8'))  
                    value = pickle.loads(decoded)  
                except Exception as e:  
                    logger.error(f"Failed to deserialize variable {var_name}: {e}")  
                    return None  
              
            logger.info(f"Variable {var_name} retrieved successfully")  
            return value  
              
        except requests.RequestException as e:  
            logger.error(f"Failed to get variable {var_name}: {e}")  
            return None  
      
    def store_dataframe(self, var_name: str, df, ttl: Optional[str] = None) -> bool:  
        """专门用于存储pandas DataFrame"""  
        try:  
            import pandas as pd  
            if isinstance(df, pd.DataFrame):  
                # 转换为JSON格式存储  
                json_data = df.to_json(orient='records')  
                return self.store_variable(var_name, json_data, 'dataframe_json', ttl, serialize=False)  
        except ImportError:  
            logger.warning("pandas not available, falling back to pickle serialization")  
          
        return self.store_variable(var_name, df, 'dataframe', ttl)  
      
    def get_dataframe(self, var_name: str):  
        """专门用于获取pandas DataFrame"""  
        try:  
            import pandas as pd  
            data = self.get_variable(var_name, deserialize=False)  
            if data is None:  
                return None  
  
            # store_dataframe 存储的DataFrame取回时是JSON字符串  
            if isinstance(data, str):  
                try:  
                    return pd.read_json(data, orient='records')  
                except ValueError:  
                    pass  
  
            # 否则尝试反序列化pickled对象  
            return self.get_variable(var_name, deserialize=True)  
  
        except ImportError:  
            logger.warning("pandas not available")  
            return self.get_variable(var_name, deserialize=True)  
      
    def store_numpy_array(self, var_name: str, array, ttl: Optional[str] = None) -> bool:  
        """专门用于存储numpy数组"""  
        try:  
            import numpy as np  
            if isinstance(array, np.ndarray):  
                # 转换为列表存储  
                array_data = {  
                    'data': array.tolist(),  
                    'shape': array.shape,  
                    'dtype': str(array.dtype)  
                }  
                return self.store_variable(var_name, array_data, 'numpy_array', ttl, serialize=False)  
        except ImportError:  
            logger.warning("numpy not available, falling back to pickle serialization")  
          
        return self.store_variable(var_name, array, 'numpy_array', ttl)  
      
    def get_numpy_array(self, var_name: str):  
        """专门用于获取numpy数组"""  
        try:  
            import numpy as np  
            data = self.get_variable(var_name, deserialize=False)  
            if data is None:  
                return None  
              
            # 如果是结构化的numpy数据  
            if isinstance(data, dict) and 'data' in data and 'shape' in data:  
                return np.array(data['data'], dtype=data.get('dtype')).reshape(data['shape'])  
              
            # 否则尝试反序列化  
            return self.get_variable(var_name, deserialize=True)  
              
        except ImportError:  
            logger.warning("numpy not available")  
            return self.get_variable(var_name, deserialize=True)  
  
# 托管存储装饰器  
def with_storage(function_name: str = None):  
    """装饰器,自动为函数提供存储功能"""  
    def decorator(func):  
        def wrapper(*args, **kwargs):  
            # 创建存储实例  
            storage = OpenFaaSStorage()  
              
            # 创建任务  
            func_name = function_name or func.__name__  
            storage.create_task(func_name)  
              
            # 将storage注入到函数参数中  
            kwargs['storage'] = storage  
              
            return func(*args, **kwargs)  
        return wrapper  
    return decorator

5. Example Python Function

# example_function/handler.py  
import json  
import numpy as np  
import pandas as pd  
from openfaas_storage import OpenFaaSStorage, with_storage  
  
@with_storage("data-processing")  
def handle(req, storage: OpenFaaSStorage):  
    """示例Python计算函数,展示如何使用存储系统"""  
      
    try:  
        # 解析输入  
        input_data = json.loads(req) if isinstance(req, str) else req  
        step = input_data.get('step', 'start')  
          
        if step == 'start':  
            # 第一步:数据预处理  
            raw_data = input_data.get('data', [])  
              
            # 创建DataFrame  
            df = pd.DataFrame(raw_data)  
            storage.store_dataframe('raw_dataframe', df, ttl='1h')  
              
            # 创建numpy数组  
            array = np.array([1, 2, 3, 4, 5])  
            storage.store_numpy_array('processing_array', array, ttl='1h')  
              
            # 存储中间结果  
            intermediate_result = {'processed_count': len(raw_data)}  
            storage.store_variable('intermediate_result', intermediate_result, ttl='1h')  
              
            return {  
                'status': 'step1_complete',  
                'message': 'Data preprocessing completed',  
                'next_step': 'process'  
            }  
              
        elif step == 'process':  
            # 第二步:数据处理  
            df = storage.get_dataframe('raw_dataframe')  
            array = storage.get_numpy_array('processing_array')  
            intermediate = storage.get_variable('intermediate_result')  
              
            if df is None or array is None:  
                return {'error': 'Missing intermediate data'}  
              
            # 执行计算  
            processed_df = df.copy()  
            processed_df['computed'] = processed_df.iloc[:, 0] * array[0] if len(df) > 0 else 0  
              
            # 存储处理结果  
            storage.store_dataframe('processed_dataframe', processed_df, ttl='2h')  
              
            # 更新中间结果  
            intermediate['processing_complete'] = True  
            storage.store_variable('intermediate_result', intermediate, ttl='2h')  
              
            return {  
                'status': 'step2_complete',  
                'message': 'Data processing completed',  
                'next_step': 'finalize'  
            }  
              
        elif step == 'finalize':  
            # 第三步:最终化  
            processed_df = storage.get_dataframe('processed_dataframe')  
            intermediate = storage.get_variable('intermediate_result')  
              
            if processed_df is None:  
                return {'error': 'Missing processed data'}  
              
            # 生成最终结果  
            final_result = {  
                'total_rows': len(processed_df),  
                'computed_sum': processed_df['computed'].sum() if 'computed' in processed_df.columns else 0,  
                'metadata': intermediate  
            }  
              
            # 存储最终结果  
            storage.store_variable('final_result', final_result, ttl='24h')  
              
            return {  
                'status': 'complete',  
                'result': final_result  
            }  
              
        else:  
            return {'error': f'Unknown step: {step}'}  
              
    except Exception as e:  
        return {'error': str(e)}  
  
# 不使用装饰器的版本  
def handle_manual(req):  
    """手动管理存储的示例"""  
    storage = OpenFaaSStorage()  
      
    # 手动创建任务  
    storage.create_task('manual-processing')  
      
    # 存储和获取变量  
    storage.store_variable('temp_var', {'key': 'value'})  
    result = storage.get_variable('temp_var')  
      
    return {'stored_and_retrieved': result}

6. Mock Data for the Python Function's req Variable

1. Basic JSON request data

# 第一步:数据预处理  
req_step1 = {  
    "step": "start",  
    "data": [  
        {"id": 1, "value": 10, "category": "A"},  
        {"id": 2, "value": 20, "category": "B"},  
        {"id": 3, "value": 15, "category": "A"},  
        {"id": 4, "value": 25, "category": "C"}  
    ],  
    "task_config": {  
        "batch_size": 100,  
        "timeout": 300  
    }  
}

2. Request for the data-processing step

# 第二步:数据处理  
req_step2 = {  
    "step": "process",  
    "processing_options": {  
        "algorithm": "linear_regression",  
        "normalize": True,  
        "feature_columns": ["value"]  
    }  
}

3. Request for the finalization step

# 第三步:最终化  
req_step3 = {  
    "step": "finalize",  
    "output_format": "json",  
    "include_metadata": True,  
    "export_options": {  
        "compress": False,  
        "format": "csv"  
    }  
}

4. Request with complex data structures

# 包含numpy数组和pandas DataFrame数据的请求  
req_complex = {  
    "step": "start",  
    "data": {  
        "matrix_data": [  
            [1.0, 2.0, 3.0],  
            [4.0, 5.0, 6.0],  
            [7.0, 8.0, 9.0]  
        ],  
        "time_series": {  
            "timestamps": ["2023-01-01", "2023-01-02", "2023-01-03"],  
            "values": [100, 150, 120]  
        },  
        "metadata": {  
            "source": "sensor_data",  
            "version": "1.0",  
            "created_at": "2023-12-01T10:00:00Z"  
        }  
    },  
    "processing_params": {  
        "window_size": 5,  
        "overlap": 0.5,  
        "method": "moving_average"  
    }  
}

5. Requests for error-handling scenarios

# 无效步骤的请求  
req_invalid = {  
    "step": "unknown_step",  
    "data": []  
}  
  
# 缺少必要参数的请求  
req_missing = {  
    "step": "process"  
    # 缺少data字段  
}

6. Complete example of actual usage

# 模拟实际HTTP请求中的req变量  
def test_python_function():  
    from handler import handle  
      
    # 测试第一步  
    result1 = handle(json.dumps(req_step1))  
    print("Step 1 result:", result1)  
      
    # 测试第二步  
    result2 = handle(json.dumps(req_step2))  
    print("Step 2 result:", result2)  
      
    # 测试第三步  
    result3 = handle(json.dumps(req_step3))  
    print("Step 3 result:", result3)

7. Window-Algorithm Python Function

Building on the storage system implemented above, we create a Python function that supports multiple window algorithms:

# window_algorithm_function/handler.py  
import json  
import numpy as np  
import pandas as pd  
from typing import List, Dict, Any, Optional, Union  
from openfaas_storage import OpenFaaSStorage, with_storage  
  
class WindowAlgorithms:  
    """窗口算法实现类"""  
      
    @staticmethod  
    def sliding_window(data: np.ndarray, window_size: int, step: int = 1) -> np.ndarray:  
        """滑动窗口算法"""  
        if len(data) < window_size:  
            return np.array([data])  
          
        windows = []  
        for i in range(0, len(data) - window_size + 1, step):  
            windows.append(data[i:i + window_size])  
        return np.array(windows)  
      
    @staticmethod  
    def tumbling_window(data: np.ndarray, window_size: int) -> np.ndarray:  
        """翻滚窗口算法"""  
        windows = []  
        for i in range(0, len(data), window_size):  
            window = data[i:i + window_size]  
            if len(window) == window_size:  # 只保留完整窗口  
                windows.append(window)  
        return np.array(windows)  
      
    @staticmethod  
    def hopping_window(data: np.ndarray, window_size: int, hop_size: int) -> np.ndarray:  
        """跳跃窗口算法"""  
        windows = []  
        for i in range(0, len(data) - window_size + 1, hop_size):  
            windows.append(data[i:i + window_size])  
        return np.array(windows)  
      
    @staticmethod  
    def session_window(data: np.ndarray, timestamps: np.ndarray, gap_threshold: float) -> List[np.ndarray]:  
        """会话窗口算法"""  
        if len(data) != len(timestamps):  
            raise ValueError("Data and timestamps must have the same length")  
          
        sessions = []  
        current_session = [data[0]]  
        current_timestamps = [timestamps[0]]  
          
        for i in range(1, len(data)):  
            time_gap = timestamps[i] - timestamps[i-1]  
            if time_gap <= gap_threshold:  
                current_session.append(data[i])  
                current_timestamps.append(timestamps[i])  
            else:  
                sessions.append(np.array(current_session))  
                current_session = [data[i]]  
                current_timestamps = [timestamps[i]]  
          
        if current_session:  
            sessions.append(np.array(current_session))  
          
        return sessions  
  
class WindowProcessor:  
    """窗口处理器,集成各种窗口算法和聚合函数"""  
      
    def __init__(self, storage: OpenFaaSStorage):  
        self.storage = storage  
        self.algorithms = WindowAlgorithms()  
      
    def process_sliding_window(self, data: np.ndarray, config: Dict[str, Any]) -> Dict[str, Any]:  
        """处理滑动窗口"""  
        window_size = config.get('window_size', 5)  
        step = config.get('step', 1)  
        aggregation = config.get('aggregation', 'mean')  
          
        windows = self.algorithms.sliding_window(data, window_size, step)  
          
        # 应用聚合函数  
        results = self._apply_aggregation(windows, aggregation)  
          
        # 存储中间结果  
        self.storage.store_numpy_array('sliding_windows', windows, ttl='1h')  
        self.storage.store_variable('sliding_results', results.tolist(), ttl='1h')  
          
        return {  
            'window_type': 'sliding',  
            'window_count': len(windows),  
            'aggregated_results': results.tolist(),  
            'config': config  
        }  
      
    def process_tumbling_window(self, data: np.ndarray, config: Dict[str, Any]) -> Dict[str, Any]:  
        """处理翻滚窗口"""  
        window_size = config.get('window_size', 5)  
        aggregation = config.get('aggregation', 'mean')  
          
        windows = self.algorithms.tumbling_window(data, window_size)  
        results = self._apply_aggregation(windows, aggregation)  
          
        # 存储结果  
        self.storage.store_numpy_array('tumbling_windows', windows, ttl='1h')  
        self.storage.store_variable('tumbling_results', results.tolist(), ttl='1h')  
          
        return {  
            'window_type': 'tumbling',  
            'window_count': len(windows),  
            'aggregated_results': results.tolist(),  
            'config': config  
        }  
      
    def process_hopping_window(self, data: np.ndarray, config: Dict[str, Any]) -> Dict[str, Any]:  
        """处理跳跃窗口"""  
        window_size = config.get('window_size', 5)  
        hop_size = config.get('hop_size', 2)  
        aggregation = config.get('aggregation', 'mean')  
          
        windows = self.algorithms.hopping_window(data, window_size, hop_size)  
        results = self._apply_aggregation(windows, aggregation)  
          
        # 存储结果  
        self.storage.store_numpy_array('hopping_windows', windows, ttl='1h')  
        self.storage.store_variable('hopping_results', results.tolist(), ttl='1h')  
          
        return {  
            'window_type': 'hopping',  
            'window_count': len(windows),  
            'aggregated_results': results.tolist(),  
            'config': config  
        }  
      
    def process_session_window(self, data: np.ndarray, timestamps: np.ndarray, config: Dict[str, Any]) -> Dict[str, Any]:  
        """处理会话窗口"""  
        gap_threshold = config.get('gap_threshold', 5.0)  
        aggregation = config.get('aggregation', 'mean')  
          
        sessions = self.algorithms.session_window(data, timestamps, gap_threshold)  
          
        # 对每个会话应用聚合函数  
        session_results = []  
        for i, session in enumerate(sessions):  
            result = self._apply_aggregation(session.reshape(1, -1), aggregation)[0]  
            session_results.append(result)  
            # 存储每个会话  
            self.storage.store_numpy_array(f'session_{i}', session, ttl='1h')  
          
        self.storage.store_variable('session_results', session_results, ttl='1h')  
          
        return {  
            'window_type': 'session',  
            'session_count': len(sessions),  
            'session_sizes': [len(session) for session in sessions],  
            'aggregated_results': session_results,  
            'config': config  
        }  
      
    def _apply_aggregation(self, windows: np.ndarray, aggregation: str) -> np.ndarray:  
        """应用聚合函数到窗口数据"""  
        if aggregation == 'mean':  
            return np.mean(windows, axis=1)  
        elif aggregation == 'sum':  
            return np.sum(windows, axis=1)  
        elif aggregation == 'max':  
            return np.max(windows, axis=1)  
        elif aggregation == 'min':  
            return np.min(windows, axis=1)  
        elif aggregation == 'std':  
            return np.std(windows, axis=1)  
        elif aggregation == 'var':  
            return np.var(windows, axis=1)  
        elif aggregation == 'median':  
            return np.median(windows, axis=1)  
        else:  
            raise ValueError(f"Unsupported aggregation function: {aggregation}")  
  
@with_storage("window-algorithm-processor")  
def handle(req, storage: OpenFaaSStorage):  
    """主处理函数"""  
    try:  
        # 解析输入  
        input_data = json.loads(req) if isinstance(req, str) else req  
          
        # 获取处理步骤  
        step = input_data.get('step', 'process')  
          
        if step == 'process':  
            # 处理窗口算法  
            return process_window_algorithm(input_data, storage)  
        elif step == 'analyze':  
            # 分析已处理的窗口结果  
            return analyze_window_results(input_data, storage)  
        elif step == 'compare':  
            # 比较不同窗口算法的结果  
            return compare_window_algorithms(input_data, storage)  
        else:  
            return {'error': f'Unknown step: {step}'}  
              
    except Exception as e:  
        return {'error': str(e)}  
  
def process_window_algorithm(input_data: Dict[str, Any], storage: OpenFaaSStorage) -> Dict[str, Any]:  
    """处理窗口算法"""  
    # 获取输入数据  
    data_array = np.array(input_data.get('data', []))  
    timestamps = np.array(input_data.get('timestamps', []))  
    window_config = input_data.get('window_config', {})  
    window_type = window_config.get('type', 'sliding')  
      
    if len(data_array) == 0:  
        return {'error': 'No data provided'}  
      
    # 创建窗口处理器  
    processor = WindowProcessor(storage)  
      
    # 根据窗口类型处理  
    if window_type == 'sliding':  
        result = processor.process_sliding_window(data_array, window_config)  
    elif window_type == 'tumbling':  
        result = processor.process_tumbling_window(data_array, window_config)  
    elif window_type == 'hopping':  
        result = processor.process_hopping_window(data_array, window_config)  
    elif window_type == 'session':  
        if len(timestamps) == 0:  
            return {'error': 'Timestamps required for session windows'}  
        result = processor.process_session_window(data_array, timestamps, window_config)  
    else:  
        return {'error': f'Unsupported window type: {window_type}'}  
      
    # 存储原始数据  
    storage.store_numpy_array('original_data', data_array, ttl='2h')  
    if len(timestamps) > 0:  
        storage.store_numpy_array('timestamps', timestamps, ttl='2h')  
      
    return {  
        'status': 'success',  
        'message': f'{window_type.capitalize()} window processing completed',  
        'result': result,  
        'data_size': len(data_array)  
    }  
  
def analyze_window_results(input_data: Dict[str, Any], storage: OpenFaaSStorage) -> Dict[str, Any]:  
    """分析窗口处理结果"""  
    window_type = input_data.get('window_type', 'sliding')  
      
    # 获取存储的结果  
    results_key = f'{window_type}_results'  
    results = storage.get_variable(results_key)  
      
    if results is None:  
        return {'error': f'No results found for {window_type} windows'}  
      
    results_array = np.array(results)  
      
    # 计算统计信息  
    analysis = {  
        'window_type': window_type,  
        'total_windows': len(results_array),  
        'statistics': {  
            'mean': float(np.mean(results_array)),  
            'std': float(np.std(results_array)),  
            'min': float(np.min(results_array)),  
            'max': float(np.max(results_array)),  
            'median': float(np.median(results_array)),  
            'q25': float(np.percentile(results_array, 25)),  
            'q75': float(np.percentile(results_array, 75))  
        }  
    }  
      
    # 存储分析结果  
    storage.store_variable(f'{window_type}_analysis', analysis, ttl='2h')  
      
    return {  
        'status': 'success',  
        'message': f'{window_type.capitalize()} window analysis completed',  
        'analysis': analysis  
    }  
  
def compare_window_algorithms(input_data: Dict[str, Any], storage: OpenFaaSStorage) -> Dict[str, Any]:  
    """比较不同窗口算法的结果"""  
    window_types = input_data.get('window_types', ['sliding', 'tumbling', 'hopping'])  
      
    comparison = {}  
      
    for window_type in window_types:  
        results = storage.get_variable(f'{window_type}_results')  
        if results is not None:  
            results_array = np.array(results)  
            comparison[window_type] = {  
                'count': len(results_array),  
                'mean': float(np.mean(results_array)),  
                'std': float(np.std(results_array)),  
                'range': float(np.max(results_array) - np.min(results_array))  
            }  
      
    # 存储比较结果  
    storage.store_variable('window_comparison', comparison, ttl='2h')  
      
    return {  
        'status': 'success',  
        'message': 'Window algorithm comparison completed',  
        'comparison': comparison  
    }

Usage Example

# 调用示例  
import requests  
import json  
import numpy as np  
  
# 生成测试数据  
data = np.random.randn(100).tolist()  
timestamps = np.arange(100).tolist()  
  
# 滑动窗口处理  
sliding_request = {  
    "step": "process",  
    "data": data,  
    "window_config": {  
        "type": "sliding",  
        "window_size": 10,  
        "step": 2,  
        "aggregation": "mean"  
    }  
}  
  
response = requests.post(  
    'http://gateway:8080/function/window-algorithm-processor',  
    json=sliding_request,  
    headers={'X-Task-ID': 'window-task-123'}  
)  
  
print("Sliding window result:", response.json())  
  
# 会话窗口处理  
session_request = {  
    "step": "process",  
    "data": data,  
    "timestamps": timestamps,  
    "window_config": {  
        "type": "session",  
        "gap_threshold": 5.0,  
        "aggregation": "mean"  
    }  
}  
  
response = requests.post(  
    'http://gateway:8080/function/window-algorithm-processor',  
    json=session_request,  
    headers={'X-Task-ID': 'window-task-123'}  
)  
  
print("Session window result:", response.json())

8. Configuration File

# function.yml  
version: 1.0  
provider:  
  name: openfaas  
  gateway: http://127.0.0.1:8080  
  
functions:  
  data-processor:  
    lang: python3  
    handler: ./handler  
    image: data-processor:latest  
    environment:  
      OPENFAAS_GATEWAY_URL: "http://gateway:8080"  
      NATS_URL: "nats://nats:4222"  
    secrets:  
      - openfaas-auth

9. Deployment Script

#!/bin/bash  
# deploy.sh  
  
# 创建认证secret(函数会引用它,需先创建)  
kubectl create secret generic openfaas-auth \  
  --from-literal=username=admin \  
  --from-literal=password=password \  
  -n openfaas-fn  
  
# 构建函数  
faas-cli build -f function.yml  
  
# 部署函数  
faas-cli deploy -f function.yml  

Usage Example

# 调用示例  
import requests  
import json  
  
# 第一步调用  
response1 = requests.post(  
    'http://gateway:8080/function/data-processor',  
    json={  
        'step': 'start',  
        'data': [{'id': 1, 'value': 10}, {'id': 2, 'value': 20}]  
    },  
    headers={'X-Task-ID': 'task-123'}  
)  
  
# 第二步调用  
response2 = requests.post(  
    'http://gateway:8080/function/data-processor',  
    json={'step': 'process'},  
    headers={'X-Task-ID': 'task-123'}  
)  
  
# 第三步调用  
response3 = requests.post(  
    'http://gateway:8080/function/data-processor',  
    json={'step': 'finalize'},  
    headers={'X-Task-ID': 'task-123'}  
)  
  
print(response3.json())