Open vSwitchのソースコードを読む(10) flow table の検索

2013-06-18 by Daisuke Kotani

今回は、kernel module から渡されたパケットがどのように処理されるのかを追いながら、flow tableの検索について調べていきます。

handle_upcalls

kernel module で flow miss をしたものは、userspace process の handle_upcalls (ofproto-dpif.c) で処理されます。

static int
handle_upcalls(struct dpif_backer *backer, unsigned int max_batch)
{
    struct dpif_upcall misses[FLOW_MISS_MAX_BATCH];
    struct ofpbuf miss_bufs[FLOW_MISS_MAX_BATCH];
    uint64_t miss_buf_stubs[FLOW_MISS_MAX_BATCH][4096 / 8];
    int n_processed;
    int n_misses;
    int i;

    ovs_assert(max_batch <= FLOW_MISS_MAX_BATCH);

    n_misses = 0;
    for (n_processed = 0; n_processed < max_batch; n_processed++) {
        struct dpif_upcall *upcall = &misses[n_misses];
        struct ofpbuf *buf = &miss_bufs[n_misses];
        int error;

        ofpbuf_use_stub(buf, miss_buf_stubs[n_misses],
                        sizeof miss_buf_stubs[n_misses]);
        error = dpif_recv(backer->dpif, upcall, buf);
        if (error) {
            ofpbuf_uninit(buf);
            break;
        }

        switch (classify_upcall(upcall)) {
        case MISS_UPCALL:
            /* Handle it later. */
            n_misses++;
            break;

        case SFLOW_UPCALL:
            handle_sflow_upcall(backer, upcall);
            ofpbuf_uninit(buf);
            break;

        case BAD_UPCALL:
            ofpbuf_uninit(buf);
            break;
        }
    }

    /* Handle deferred MISS_UPCALL processing. */
    handle_miss_upcalls(backer, misses, n_misses);
    for (i = 0; i < n_misses; i++) {
        ofpbuf_uninit(&miss_bufs[i]);
    }

    return n_processed;
}

これを眺めてざっと分かることは、1回のhandle_upcallsで処理できるパケット数は max_batch で制限されていること、flow miss によるものは handle_miss_upcalls 関数で処理されることです。SFLOW_UPCALLはsflowに関するものですが、今回はパスします。

handle_miss_upcalls 関数に進みます。同じflowのパケットを集めて、まとめて処理をする、ということをしています。

static void
handle_miss_upcalls(struct dpif_backer *backer, struct dpif_upcall *upcalls,
                    size_t n_upcalls)
{
    struct dpif_upcall *upcall;
    struct flow_miss *miss;
    struct flow_miss misses[FLOW_MISS_MAX_BATCH];
    struct flow_miss_op flow_miss_ops[FLOW_MISS_MAX_BATCH * 2];
    struct dpif_op *dpif_ops[FLOW_MISS_MAX_BATCH * 2];
    struct hmap todo;
    int n_misses;
    size_t n_ops;
    size_t i;

    if (!n_upcalls) {
        return;
    }

    /* Construct the to-do list.
     *
     * This just amounts to extracting the flow from each packet and sticking
     * the packets that have the same flow in the same "flow_miss" structure so
     * that we can process them together. */
    hmap_init(&todo);
    n_misses = 0;
    for (upcall = upcalls; upcall < &upcalls[n_upcalls]; upcall++) {
        struct flow_miss *miss = &misses[n_misses];
        struct flow_miss *existing_miss;
        struct ofproto_dpif *ofproto;
        uint32_t odp_in_port;
        struct flow flow;
        uint32_t hash;
        int error;

        error = ofproto_receive(backer, upcall->packet, upcall->key,
                                upcall->key_len, &flow, &miss->key_fitness,
                                &ofproto, &odp_in_port, &miss->initial_vals);

        if (error == ENODEV) {
            struct drop_key *drop_key;

            /* Received packet on port for which we couldn't associate
             * an ofproto.  This can happen if a port is removed while
             * traffic is being received.  Print a rate-limited message
             * in case it happens frequently.  Install a drop flow so
             * that future packets of the flow are inexpensively dropped
             * in the kernel. */
            VLOG_INFO_RL(&rl, "received packet on unassociated port %"PRIu32,
                         flow.in_port);

            drop_key = drop_key_lookup(backer, upcall->key, upcall->key_len);
            if (!drop_key) {
                drop_key = xmalloc(sizeof *drop_key);
                drop_key->key = xmemdup(upcall->key, upcall->key_len);
                drop_key->key_len = upcall->key_len;

                hmap_insert(&backer->drop_keys, &drop_key->hmap_node,
                            hash_bytes(drop_key->key, drop_key->key_len, 0));
                dpif_flow_put(backer->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY,
                              drop_key->key, drop_key->key_len, NULL, 0, NULL);
            }
            continue;
        }
        if (error) {
            continue;
        }

ここまでは、port が正しいかどうか、正しくなければ以降そのポートからのパケットをdropするルールを設定する処理です。「ルールを設定する」というのはdpif_flow_put関数で行われています。

        ofproto->n_missed++;
        flow_extract(upcall->packet, flow.skb_priority, flow.skb_mark,
                     &flow.tunnel, flow.in_port, &miss->flow);

        /* Add other packets to a to-do list. */
        hash = flow_hash(&miss->flow, 0);
        existing_miss = flow_miss_find(&todo, ofproto, &miss->flow, hash);
        if (!existing_miss) {
            hmap_insert(&todo, &miss->hmap_node, hash);
            miss->ofproto = ofproto;
            miss->key = upcall->key;
            miss->key_len = upcall->key_len;
            miss->upcall_type = upcall->type;
            miss->odp_in_port = odp_in_port;
            list_init(&miss->packets);

            n_misses++;
        } else {
            miss = existing_miss;
        }
        list_push_back(&miss->packets, &upcall->packet->list_node);
    }

ここが、パケットをflow毎にまとめている部分です。flow_hash で hash 値を計算し、もしハッシュ値がtodoになければinsertしています。ハッシュ値毎にflow_miss構造体が用意され、flow_miss 構造体の中の packets というリストにそれに該当するパケットを保持しています。

    /* Process each element in the to-do list, constructing the set of
     * operations to batch. */
    n_ops = 0;
    HMAP_FOR_EACH (miss, hmap_node, &todo) {
        handle_flow_miss(miss, flow_miss_ops, &n_ops);
    }
    ovs_assert(n_ops <= ARRAY_SIZE(flow_miss_ops));

コメントにもありますが、これはflow毎にどのような処理を行うかを検索している部分です。handle_flow_miss 関数は次に見ます。

    /* Execute batch. */
    for (i = 0; i < n_ops; i++) {
        dpif_ops[i] = &flow_miss_ops[i].dpif_op;
    }
    dpif_operate(backer->dpif, dpif_ops, n_ops);

    /* Free memory. */
    for (i = 0; i < n_ops; i++) {
        free(flow_miss_ops[i].garbage);
    }
    hmap_destroy(&todo);
}

処理を実行して終わり。

handle_flow_miss

handle_flow_miss 関数は ofproto-dpif.c にあります。指定された flow をどのように処理すればよいのかを検索するものです。

/* Handles flow miss 'miss'.  May add any required datapath operations
 * to 'ops', incrementing '*n_ops' for each new op. */
static void
handle_flow_miss(struct flow_miss *miss, struct flow_miss_op *ops,
                 size_t *n_ops)
{
    struct ofproto_dpif *ofproto = miss->ofproto;
    struct facet *facet;
    long long int now;
    uint32_t hash;

    /* The caller must ensure that miss->hmap_node.hash contains
     * flow_hash(miss->flow, 0). */
    hash = miss->hmap_node.hash;

    facet = facet_lookup_valid(ofproto, &miss->flow, hash);
    if (!facet) {
        struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow);

        if (!flow_miss_should_make_facet(ofproto, miss, hash)) {
            handle_flow_miss_without_facet(miss, rule, ops, n_ops);
            return;
        }

        facet = facet_create(rule, &miss->flow, hash);
        now = facet->used;
    } else {
        now = time_msec();
    }
    handle_flow_miss_with_facet(miss, facet, now, ops, n_ops);
}

facet が存在するかどうか、facet を生成してもよいかで分けています。 flow_miss_should_make_facet 関数のコメントを読むと、だいたい YES だと書いてあるので、次にhandle_flow_miss_with_facet関数を見ていきます。rule の検索をしている rule_dpif_lookup 関数は後回しにします。

handle_flow_miss_with_facet

ここで行われている処理は、subfacetを作って、datapath に設定する操作を行うようにflow_miss_opsにデータを追加することです。もし、datapath で十分なパケットとflowのmatchができなければ、slowpathに追加します。

/* Handles 'miss', which matches 'facet'.  May add any required datapath
 * operations to 'ops', incrementing '*n_ops' for each new op.
 *
 * All of the packets in 'miss' are considered to have arrived at time 'now'.
 * This is really important only for new facets: if we just called time_msec()
 * here, then the new subfacet or its packets could look (occasionally) as
 * though it was used some time after the facet was used.  That can make a
 * one-packet flow look like it has a nonzero duration, which looks odd in
 * e.g. NetFlow statistics. */
static void
handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet,
                            long long int now,
                            struct flow_miss_op *ops, size_t *n_ops)
{
    struct ofproto_dpif *ofproto = ofproto_dpif_cast(facet->rule->up.ofproto);
    enum subfacet_path want_path;
    struct subfacet *subfacet;
    struct ofpbuf *packet;

    subfacet = subfacet_create(facet, miss, now);

    LIST_FOR_EACH (packet, list_node, &miss->packets) {
        struct flow_miss_op *op = &ops[*n_ops];
        struct dpif_flow_stats stats;
        struct ofpbuf odp_actions;

        handle_flow_miss_common(facet->rule, packet, &miss->flow);

        ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub);
        if (!subfacet->actions || subfacet->slow) {
            subfacet_make_actions(subfacet, packet, &odp_actions);
        }

        dpif_flow_stats_extract(&facet->flow, packet, now, &stats);
        subfacet_update_stats(subfacet, &stats);

        if (subfacet->actions_len) {
            struct dpif_execute *execute = &op->dpif_op.u.execute;

            init_flow_miss_execute_op(miss, packet, op);
            if (!subfacet->slow) {
                execute->actions = subfacet->actions;
                execute->actions_len = subfacet->actions_len;
                ofpbuf_uninit(&odp_actions);
            } else {
                execute->actions = odp_actions.data;
                execute->actions_len = odp_actions.size;
                op->garbage = ofpbuf_get_uninit_pointer(&odp_actions);
            }

            (*n_ops)++;
        } else {
            ofpbuf_uninit(&odp_actions);
        }
    }

    want_path = subfacet_want_path(subfacet->slow);
    if (miss->upcall_type == DPIF_UC_MISS || subfacet->path != want_path) {
        struct flow_miss_op *op = &ops[(*n_ops)++];
        struct dpif_flow_put *put = &op->dpif_op.u.flow_put;

        subfacet->path = want_path;

        op->garbage = NULL;
        op->dpif_op.type = DPIF_OP_FLOW_PUT;
        put->flags = DPIF_FP_CREATE | DPIF_FP_MODIFY;
        put->key = miss->key;
        put->key_len = miss->key_len;
        if (want_path == SF_FAST_PATH) {
            put->actions = subfacet->actions;
            put->actions_len = subfacet->actions_len;
        } else {
            compose_slow_path(ofproto, &facet->flow, subfacet->slow,
                              op->stub, sizeof op->stub,
                              &put->actions, &put->actions_len);
        }
        put->stats = NULL;
    }
}

rule_dpif_lookup

flowを処理するルールを検索する関数が rule_dpif_lookup です。実際に検索している関数は rule_dpif_lookup__ 関数で、もしルールが見つからなかったときには、デフォルトのルールを返すようにしています(rule_dpif_miss_rule)。

/* Rules. */

static struct rule_dpif *
rule_dpif_lookup(struct ofproto_dpif *ofproto, const struct flow *flow)
{
    struct rule_dpif *rule;

    rule = rule_dpif_lookup__(ofproto, flow, 0);
    if (rule) {
        return rule;
    }

    return rule_dpif_miss_rule(ofproto, flow);
}

rule_dpif_lookup__ 関数を見ていきます。

static struct rule_dpif *
rule_dpif_lookup__(struct ofproto_dpif *ofproto, const struct flow *flow,
                   uint8_t table_id)
{
    struct cls_rule *cls_rule;
    struct classifier *cls;

    if (table_id >= N_TABLES) {
        return NULL;
    }

    cls = &ofproto->up.tables[table_id].cls;
    if (flow->nw_frag & FLOW_NW_FRAG_ANY
        && ofproto->up.frag_handling == OFPC_FRAG_NORMAL) {
        /* For OFPC_NORMAL frag_handling, we must pretend that transport ports
         * are unavailable. */
        struct flow ofpc_normal_flow = *flow;
        ofpc_normal_flow.tp_src = htons(0);
        ofpc_normal_flow.tp_dst = htons(0);
        cls_rule = classifier_lookup(cls, &ofpc_normal_flow);
    } else {
        cls_rule = classifier_lookup(cls, flow);
    }
    return rule_dpif_cast(rule_from_cls_rule(cls_rule));
}

実際に検索しているのは、classifier_lookup のようなので、classifier_lookup 関数を見ます。これはlib/classifier.c　にあります。

/* Finds and returns the highest-priority rule in 'cls' that matches 'flow'.
 * Returns a null pointer if no rules in 'cls' match 'flow'.  If multiple rules
 * of equal priority match 'flow', returns one arbitrarily. */
struct cls_rule *
classifier_lookup(const struct classifier *cls, const struct flow *flow)
{
    struct cls_table *table;
    struct cls_rule *best;

    best = NULL;
    HMAP_FOR_EACH (table, hmap_node, &cls->tables) {
        struct cls_rule *rule = find_match(table, flow);
        if (rule && (!best || rule->priority > best->priority)) {
            best = rule;
        }
    }
    return best;
}

cls->tables 毎に find_match を実行しています。1つ1つのtableはcls_table構造体で定義されています。

/* A set of rules that all have the same fields wildcarded. */
struct cls_table {
    struct hmap_node hmap_node; /* Within struct classifier 'tables' hmap. */
    struct hmap rules;          /* Contains "struct cls_rule"s. */
    struct minimask mask;       /* Wildcards for fields. */
    int n_table_rules;          /* Number of rules, including duplicates. */
};

Wildcards for fields という言葉が気になります。どうやら、Wildcard の種類毎に Table が作られているみたいです。こうして Wildcard の平均 lookup 時間を短くしているんだと思います（計算量を出してみないと本当に短くなるのか分かりませんが、直感的には短くなりそうです）。

find_match 関数の中は次のようになっています。

static struct cls_rule *
find_match(const struct cls_table *table, const struct flow *flow)
{
    uint32_t hash = flow_hash_in_minimask(flow, &table->mask, 0);
    struct cls_rule *rule;

    HMAP_FOR_EACH_WITH_HASH (rule, hmap_node, hash, &table->rules) {
        if (miniflow_equal_flow_in_minimask(&rule->match.flow, flow,
                                            &table->mask)) {
            return rule;
        }
    }

    return NULL;
}

flow_hash_in_minimask というのは、mask で隠されていない部分のみの値を使ってハッシュ値を計算する関数です。これを使って、ハッシュテーブルを検索しています。

ここで出てきた、miniflow や minimask とは何なのか、次でとても簡潔に説明します。

miniflow, minimask

flow.h に miniflow の説明があるので、引用します。miniflow構造体の定義の上部です。

/* A sparse representation of a "struct flow".
 *
 * A "struct flow" is fairly large and tends to be mostly zeros.  Sparse
 * representation has two advantages.  First, it saves memory.  Second, it
 * saves time when the goal is to iterate over only the nonzero parts of the
 * struct.
 *
 * The 'map' member holds one bit for each uint32_t in a "struct flow".  Each
 * 0-bit indicates that the corresponding uint32_t is zero, each 1-bit that it
 * is nonzero.
 *
 * 'values' points to the start of an array that has one element for each 1-bit
 * in 'map'.  The least-numbered 1-bit is in values[0], the next 1-bit is in
 * values[1], and so on.
 *
 * 'values' may point to a few different locations:
 *
 *     - If 'map' has MINI_N_INLINE or fewer 1-bits, it may point to
 *       'inline_values'.  One hopes that this is the common case.
 *
 *     - If 'map' has more than MINI_N_INLINE 1-bits, it may point to memory
 *       allocated with malloc().
 *
 *     - The caller could provide storage on the stack for situations where
 *       that makes sense.  So far that's only proved useful for
 *       minimask_combine(), but the principle works elsewhere.
 *
 * The implementation maintains and depends on the invariant that every element
 * in 'values' is nonzero; that is, wherever a 1-bit appears in 'map', the
 * corresponding element of 'values' must be nonzero.
 */

また、gitのcommitlogにも説明があります。

miniflow が作られるきっかけとなった問題は、flow構造体が大きくなっていて、flowの検索にflow構造体の大きさに比例した時間がかかるようになっていたということです。flow構造体のメンバーの多くの値は0という傾向があるそうなので、0でないところはどこかを記録しておいて、そこだけ一致するかどうか検索しようというのがminiflowです。

アルゴリズムはちょっとまだ理解できていないので、もし機会があれば...

次回

次回は、flow の追加や削除について調べていこうと思います。

Links
About me