Siehe auch [[ElasticSearch]] (z. B. Scripte, API etc. sind identisch). [[Wazuh]] verwendet [[OpenSearch]].
=====Installation=====
====Demo====
Run a local cluster
docker run --rm -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.6.0
Create a Python script
from opensearchpy import OpenSearch

# Demo connection to the local single-node container: default admin
# credentials, TLS enabled but certificate checks disabled because the
# container ships a self-signed cert. Do not use these settings in prod.
_conn_opts = {
    "hosts": [{"host": "localhost", "port": 9200}],
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**_conn_opts)
client.info()
Get some sample data, e.g. [[https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots|wikipedia-movie-plots]]. Read the data into a pandas DataFrame.
import pandas as pd

# Load the Kaggle movie-plots CSV, drop incomplete rows, then keep a
# reproducible 5000-row sample with a fresh 0..4999 index.
df = pd.read_csv("wiki_movie_plots_deduped.csv")
df = df.dropna()
df = df.sample(5000, random_state=42)
df = df.reset_index(drop=True)
Create an index
# Mapping for the demo "movies" index: fields searched as free text get
# the "english" analyzer (stemming + stopwords) where relevance matters
# most (title, plot), "standard" elsewhere; wiki_page is an exact-match
# keyword; year is numeric so it supports range queries.
body = {
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "ethnicity": {"type": "text", "analyzer": "standard"},
            "director": {"type": "text", "analyzer": "standard"},
            "cast": {"type": "text", "analyzer": "standard"},
            "genre": {"type": "text", "analyzer": "standard"},
            "plot": {"type": "text", "analyzer": "english"},
            "year": {"type": "integer"},
            "wiki_page": {"type": "keyword"}
        }
    }
}
# Create the index; `client` is the OpenSearch client created above.
response = client.indices.create("movies", body=body)
Push the data into the index
# Index the sample one document at a time (slow: one HTTP request per
# row -- see the bulk variant below). The DataFrame row index is reused
# as the document id.
_FIELD_MAP = [
    ("title", "Title"),
    ("ethnicity", "Origin/Ethnicity"),
    ("director", "Director"),
    ("cast", "Cast"),
    ("genre", "Genre"),
    ("plot", "Plot"),
    ("year", "Release Year"),
    ("wiki_page", "Wiki Page"),
]
for doc_id, row in df.iterrows():
    doc = {field: row[column] for field, column in _FIELD_MAP}
    client.index(index="movies", id=doc_id, body=doc)
More data in a bulk
from opensearchpy.helpers import bulk

# Same documents as above, but pushed through the bulk helper: the rows
# are batched into far fewer HTTP round trips.
bulk_data = [
    {
        "_index": "movies",
        "_id": i,
        "_source": {
            "title": row["Title"],
            "ethnicity": row["Origin/Ethnicity"],
            "director": row["Director"],
            "cast": row["Cast"],
            "genre": row["Genre"],
            "plot": row["Plot"],
            "year": row["Release Year"],
            "wiki_page": row["Wiki Page"],
        },
    }
    for i, row in df.iterrows()
]
bulk(client, bulk_data)
Count the inserted data
# Make the just-indexed documents visible to search, then print the
# document count as JSON.
client.indices.refresh(index="movies")
client.cat.count(index="movies", format="json")
Search the data
# Phrase search: movies whose cast phrase-matches "jack nicholson" but
# whose director does NOT phrase-match "tim burton". The exclusion sits
# in the filter context, so it does not affect scoring.
resp = client.search(
    index="movies",
    body={
        "query": {
            "bool": {
                "must": {
                    "match_phrase": {
                        "cast": "jack nicholson",
                    }
                },
                "filter": {"bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}},
            },
        },
    }
)
resp
Remove documents
client.delete(index="movies", id="2500")
Delete the index
client.indices.delete(index='movies')
====Production====
===v1===
Run as root
#!/usr/bin/env bash
# Setup script v1: installs Docker + Java, runs OpenSearch in a
# container and configures an Apache Nutch crawler for *.at domains.
set -Eeuo pipefail
# Report the failing line number on any error, then abort.
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR
# Root required: installs packages, writes to /etc and /opt.
if [[ "${EUID}" -ne 0 ]]; then
echo "Bitte mit sudo/root ausführen."
exit 1
fi
# Install locations and pinned versions; REAL_USER/REAL_GROUP identify
# the invoking sudo user, who will own ${APP_ROOT} at the end.
APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER")"
echo "[1/12] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
curl \
wget \
jq \
tar \
gzip \
unzip \
ca-certificates \
gnupg \
lsb-release \
software-properties-common \
docker.io \
docker-compose \
openjdk-11-jdk-headless
systemctl enable docker
systemctl restart docker
echo "[2/12] Setze vm.max_map_count"
# OpenSearch/Lucene needs a high mmap count; persist it in sysctl.conf
# (update in place if already set) and apply it immediately.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null
echo "[3/12] Setze JAVA_HOME"
# Derive JAVA_HOME from the resolved java binary (two dirname hops up
# from .../bin/java).
JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")"
# NOTE(review): the next line looks truncated by the wiki export -- it
# appears to merge "cat > /etc/profile.d/at-search-java.sh <<EOF", the
# docker-compose.yml heredoc, the compose startup, and the head of a
# curl health-check loop. Restore from the original script before use.
cat > /etc/profile.d/at-search-java.sh < "${APP_ROOT}/opensearch/docker-compose.yml" </dev/null 2>&1; then
ok=1
break
fi
sleep 2
done
# Fail fast -- with container logs for diagnosis -- if OpenSearch never
# became reachable within the polling window above.
if [[ "${ok}" != "1" ]]; then
echo "[FEHLER] OpenSearch kam nicht hoch:"
docker-compose logs --tail=200 opensearch || true
exit 1
fi
echo "[8/12] Lege Index at_web neu an"
# Recreate the at_web index from scratch: delete any previous one
# (ignore a 404), then PUT an explicit mapping. Single node, so
# 1 shard / 0 replicas.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
-H 'Content-Type: application/json' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}
}
}
}' >/dev/null
echo "[9/12] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
# Download the Nutch binary release only once; always re-extract so the
# tree is pristine, then point the stable "nutch" symlink at it.
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
"https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
echo "[10/12] Konfiguriere Nutch + Hadoop local paths"
# NOTE(review): the heredoc payloads below were XML property files; the
# wiki export stripped the XML tags, leaving bare name/value pairs.
# They document the intended settings but are not valid nutch-site.xml
# as-is -- restore the XML wrapper from the original before running.
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
http.agent.name
ATSearchBot
http.robots.agents
ATSearchBot,*
plugin.includes
protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic
fetcher.threads.fetch
10
http.content.limit
1048576
db.ignore.external.links
false
db.ignore.internal.links
false
db.ignore.external.links.mode
byHost
generate.max.per.host
25
generate.max.per.domain
100
generate.count.mode
byDomain
fetcher.parse
true
parser.character.encoding.default
UTF-8
EOF
# NOTE(review): "<" here was presumably "<<EOF" before the export
# garbled it (unquoted heredoc so ${APP_ROOT} expands).
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <
fs.defaultFS
file:///
hadoop.tmp.dir
${APP_ROOT}/hadoop-tmp
EOF
cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <
mapreduce.framework.name
local
mapreduce.jobtracker.staging.root.dir
${APP_ROOT}/hadoop-staging
mapreduce.job.reduces
1
EOF
# URL filter: allow only *.at (and co.at/gv.at/ac.at) hosts, reject
# binary/static assets, tracking parameters, then everything else.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-.
EOF
# NOTE(review): the index-writers.xml payload is missing (empty heredoc)
# -- the OpenSearch writer configuration was lost in the export.
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
EOF
# Seed list: well-known Austrian government/news/education sites.
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.oesterreich.gv.at/
https://www.parlament.gv.at/
https://www.orf.at/
https://www.derstandard.at/
https://www.wien.gv.at/
https://www.arbeiterkammer.at/
https://www.wko.at/
https://www.ams.at/
https://www.univie.ac.at/
https://www.tuwien.at/
EOF
echo "[11/12] Schreibe Wrapper"
# at-search-crawl: run the Nutch crawl loop over the seed list.
# NOTE(review): the trailing "2" hardcodes 2 crawl rounds and any
# arguments are ignored, although the usage example below invokes
# "at-search-crawl 2" -- v2 fixes this by reading ROUNDS from $1.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging
exec "${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl 2
EOF
# at-search-reindex-last: re-index the newest crawl segment into
# OpenSearch, removing documents that are gone (-deleteGone).
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
exec "${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
EOF
# at-search-query: multi_match search over title/content/site/host with
# field boosts; jq builds the JSON body safely from the CLI argument.
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
echo 'Nutzung: at-search-query "suchbegriff"'
exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
size: 10,
query: {
multi_match: {
query: $q,
fields: ["title^3","content","site^2","host"]
}
},
_source: ["title","url","host","site"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
-H 'Content-Type: application/json' \
-d "${BODY}" | jq .
EOF
# at-search-status: quick health view -- cluster info, doc count and the
# crawl segments on disk.
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
chmod +x /usr/local/bin/at-search-crawl
chmod +x /usr/local/bin/at-search-reindex-last
chmod +x /usr/local/bin/at-search-query
chmod +x /usr/local/bin/at-search-status
echo "[12/12] Setze finale Rechte"
# Hand the tree to the invoking user; the OpenSearch data dir must be
# owned by uid/gid 1000 (the container's opensearch user).
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
# NOTE(review): "cat <" below is a truncated heredoc (probably a final
# usage message); the following lines look like its payload -- example
# commands plus a seed-swap recipe -- not executed shell code.
cat <
curl http://127.0.0.1:9200
curl http://127.0.0.1:9200/at_web/_count | jq
at-search-status
at-search-crawl 2
at-search-status
at-search-query "wien"
at-search-reindex-last
cp /opt/at-search/seeds/seed.txt /opt/at-search/seeds/seed.txt.bak
cat > /opt/at-search/seeds/seed.txt <<'EOF'
https://www.univie.ac.at/
https://studieren.univie.ac.at/
https://www.tuwien.at/
https://www.tuwien.at/studium/
EOF
at-search-crawl 3
mv /opt/at-search/seeds/seed.txt.bak /opt/at-search/seeds/seed.txt
===v2===
#!/usr/bin/env bash
# Setup script v2: like v1 (Docker + OpenSearch + Nutch) but adds a
# Python enrichment pipeline and extra wrapper commands (14 steps).
set -Eeuo pipefail
# Report the failing line number on any error, then abort.
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR
# Root required: installs packages, writes to /etc and /opt.
if [[ "${EUID}" -ne 0 ]]; then
echo "Bitte mit sudo/root ausführen."
exit 1
fi
# Install locations and pinned versions; REAL_GROUP falls back to root
# if the group lookup fails (hardened vs. v1).
APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER" 2>/dev/null || echo root)"
echo "[1/14] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
curl \
wget \
jq \
tar \
gzip \
unzip \
ca-certificates \
gnupg \
lsb-release \
software-properties-common \
docker.io \
docker-compose \
openjdk-11-jdk-headless \
python3 \
python3-venv \
python3-pip
systemctl enable docker
systemctl restart docker
echo "[2/14] Setze vm.max_map_count"
# OpenSearch/Lucene needs a high mmap count; persist and apply it.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null
echo "[3/14] Setze JAVA_HOME"
# Derive JAVA_HOME from the resolved java binary path.
JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")"
# NOTE(review): truncated by the wiki export like the v1 counterpart --
# this line merges the profile.d heredoc, the docker-compose.yml
# heredoc, compose startup and the head of a curl health-check loop.
# Restore from the original script before running.
cat > /etc/profile.d/at-search-java.sh < "${APP_ROOT}/opensearch/docker-compose.yml" </dev/null 2>&1; then
ok=1
break
fi
sleep 2
done
# Fail fast, with container logs, if OpenSearch never became reachable.
if [[ "${ok}" != "1" ]]; then
echo "[FEHLER] OpenSearch kam nicht hoch:"
docker-compose logs --tail=200 opensearch || true
exit 1
fi
echo "[9/14] Lege Index at_web neu an"
# Recreate at_web. The v2 mapping extends v1 with the enrichment fields
# (url_depth, is_root, content_length, domain_class, host_rank) that
# enrich_index.py writes after each crawl.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
-H 'Content-Type: application/json' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
"url_depth": {"type": "integer"},
"is_root": {"type": "boolean"},
"content_length": {"type": "integer"},
"domain_class": {"type": "keyword"},
"host_rank": {"type": "float"}
}
}
}' >/dev/null
echo "[10/14] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
# Download once, always re-extract, repoint the "nutch" symlink.
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
"https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
echo "[11/14] Konfiguriere Nutch + Hadoop"
# NOTE(review): as in v1, the XML tags of the heredoc payloads were
# stripped by the wiki export -- bare name/value pairs remain. Restore
# the XML wrapper from the original before running. v2 raises the
# per-host/per-domain fetch limits (200/1000 vs. 25/100 in v1).
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
http.agent.name
ATSearchBot
http.robots.agents
ATSearchBot,*
plugin.includes
protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic
fetcher.threads.fetch
10
http.content.limit
1048576
db.ignore.external.links
false
db.ignore.internal.links
false
db.ignore.external.links.mode
byHost
generate.max.per.host
200
generate.max.per.domain
1000
generate.count.mode
byDomain
fetcher.parse
true
parser.character.encoding.default
UTF-8
EOF
# NOTE(review): "<" was presumably "<<EOF" before the export mangled it.
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <
fs.defaultFS
file:///
hadoop.tmp.dir
${APP_ROOT}/hadoop-tmp
EOF
cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <
mapreduce.framework.name
local
mapreduce.jobtracker.staging.root.dir
${APP_ROOT}/hadoop-staging
mapreduce.job.reduces
1
EOF
# URL filter: v1 rules plus rejection of tag/category/archive/search
# listing pages and paginated /page/N/ URLs.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-^https?://.*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?$
-^https?://.*/(page|seite)/[0-9]+/?$
-.
EOF
# NOTE(review): index-writers.xml payload missing (empty heredoc).
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
EOF
# Seed list (same hosts as v1, different order).
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.derstandard.at/
https://orf.at/
https://www.parlament.gv.at/
https://www.tuwien.at/
https://www.univie.ac.at/
https://www.wko.at/
https://www.arbeiterkammer.at/
https://www.ams.at/
https://www.wien.gv.at/
https://www.oesterreich.gv.at/
EOF
echo "[12/14] Schreibe Enrichment-Script"
cat > "${APP_ROOT}/enrich/enrich_index.py" <<'EOF'
#!/usr/bin/env python3
"""Post-crawl enrichment for the at_web index.

Adds derived ranking fields (url_depth, is_root, content_length,
domain_class, host_rank) to every document via the bulk update API.
"""
import json
from urllib.parse import urlparse
import requests

OS_URL = "http://127.0.0.1:9200"  # local single-node OpenSearch
INDEX = "at_web"
BATCH = 200  # documents per search_after page (and per bulk request)
def domain_class(host: str) -> str:
    """Classify *host* into a coarse category used for ranking boosts.

    Categories: "gov" (gv.at incl. parliament), "edu" (ac.at),
    "news" (ORF / derStandard), "commerce" (WKO), else "general".
    """
    h = (host or "").lower()
    if h.endswith(".gv.at") or h == "www.parlament.gv.at":
        return "gov"
    if h.endswith(".ac.at"):
        return "edu"
    if "orf.at" in h or "derstandard.at" in h:
        return "news"
    return "commerce" if "wko.at" in h else "general"
def host_rank(host: str) -> float:
    """Return a static per-host quality weight (higher = boosted)."""
    h = (host or "").lower()
    # Whole TLD-level rules first.
    if h.endswith(".gv.at") or h == "www.parlament.gv.at":
        return 3.0
    if h.endswith(".ac.at"):
        return 2.5
    # Exact-host weights; everything else gets the neutral weight.
    exact_weights = {
        "www.orf.at": 2.0,
        "www.derstandard.at": 1.8,
        "www.wko.at": 1.4,
    }
    return exact_weights.get(h, 1.0)
def compute_fields(src: dict) -> dict:
    """Derive the enrichment fields from a doc's url/host/content."""
    url = src.get("url") or ""
    content = src.get("content") or ""
    # Path depth = number of non-empty path segments; the site root
    # (empty path or "/") has depth 0 and is flagged is_root.
    path = urlparse(url).path or "/"
    segments = [part for part in path.strip("/").split("/") if part]
    host = src.get("host", "")
    return {
        "url_depth": len(segments),
        "is_root": not path.strip("/"),
        "content_length": len(content),
        "domain_class": domain_class(host),
        "host_rank": host_rank(host),
    }
def main():
    """Page through every document in INDEX and write back the derived
    fields; prints the total number of enriched documents."""
    session = requests.Session()
    search_after = None
    total = 0
    while True:
        # Deep pagination via search_after over a stable sort key.
        body = {
            "size": BATCH,
            # NOTE(review): sorting on "_id" can require fielddata on
            # the _id field depending on the OpenSearch/ES version --
            # confirm this works against the deployed 1.3.x cluster.
            "sort": [{"_id": "asc"}],
            "_source": ["url", "host", "content"],
            "query": {"match_all": {}}
        }
        if search_after is not None:
            body["search_after"] = search_after
        r = session.post(f"{OS_URL}/{INDEX}/_search", json=body, timeout=30)
        r.raise_for_status()
        data = r.json()
        hits = data.get("hits", {}).get("hits", [])
        if not hits:
            # No more pages -- done.
            break
        lines = []
        for hit in hits:
            doc_id = hit["_id"]
            src = hit.get("_source", {})
            fields = compute_fields(src)
            # Bulk "update" action: metadata line + partial-doc line.
            meta = {"update": {"_index": INDEX, "_id": doc_id}}
            doc = {"doc": fields}
            lines.append(json.dumps(meta))
            lines.append(json.dumps(doc))
            total += 1
        # The bulk API requires NDJSON terminated by a newline.
        payload = "\n".join(lines) + "\n"
        br = session.post(
            f"{OS_URL}/_bulk",
            data=payload,
            headers={"Content-Type": "application/x-ndjson"},
            timeout=60,
        )
        br.raise_for_status()
        # Resume the next page after the last hit's sort values.
        search_after = hits[-1]["sort"]
    print(f"Enriched documents: {total}")


if __name__ == "__main__":
    main()
EOF
chmod +x "${APP_ROOT}/enrich/enrich_index.py"
echo "[13/14] Schreibe Wrapper"
# at-search-crawl: crawl for ROUNDS rounds (default 5, overridable via
# $1 -- fixed vs. v1's hardcoded 2), then run the enrichment pass.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
ROUNDS="${1:-5}"
mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging
"${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl "${ROUNDS}"
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-reindex-last: re-index the newest segment, then re-enrich.
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
"${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-enrich: run only the enrichment pass.
cat > /usr/local/bin/at-search-enrich <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
exec /opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-reset-index: drop and recreate at_web with the v2 mapping.
cat > /usr/local/bin/at-search-reset-index <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
curl -s -X DELETE http://127.0.0.1:9200/at_web >/dev/null 2>&1 || true
curl -s -X PUT http://127.0.0.1:9200/at_web -H 'Content-Type: application/json' -d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
"url_depth": {"type": "integer"},
"is_root": {"type": "boolean"},
"content_length": {"type": "integer"},
"domain_class": {"type": "keyword"},
"host_rank": {"type": "float"}
}
}
}' | jq .
EOF
# at-search-query: function_score query -- text relevance plus per-class
# weights and field_value_factor boosts; excludes root pages and
# listing/pagination URLs.
# NOTE(review): the url_depth factor (0.25, boost_mode sum) *rewards*
# deeper URLs -- verify that is intended; depth is usually penalized.
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
echo 'Nutzung: at-search-query "suchbegriff"'
exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
size: 10,
query: {
function_score: {
query: {
bool: {
must: [
{
multi_match: {
query: $q,
fields: ["title^8","content^2","site","host"],
type: "best_fields"
}
}
],
must_not: [
{ term: { is_root: true } },
{ regexp: { url: ".*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?" } },
{ regexp: { url: ".*/(page|seite)/[0-9]+/?" } }
],
should: [
{ match_phrase: { title: { query: $q, boost: 10 } } }
]
}
},
functions: [
{ filter: { term: { domain_class: "gov" } }, weight: 3.0 },
{ filter: { term: { domain_class: "edu" } }, weight: 2.5 },
{ filter: { term: { domain_class: "news" } }, weight: 1.5 },
{ field_value_factor: { field: "host_rank", factor: 1.0, missing: 1.0 } },
{ field_value_factor: { field: "url_depth", factor: 0.25, missing: 0 } },
{ field_value_factor: { field: "content_length", factor: 0.0005, modifier: "log1p", missing: 1 } }
],
score_mode: "sum",
boost_mode: "sum"
}
},
_source: ["title","url","host","site","url_depth","is_root","domain_class","host_rank"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
-H 'Content-Type: application/json' \
-d "${BODY}" | jq .
EOF
# at-search-status: cluster info, doc count and segments on disk.
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
chmod +x /usr/local/bin/at-search-crawl
chmod +x /usr/local/bin/at-search-reindex-last
chmod +x /usr/local/bin/at-search-query
chmod +x /usr/local/bin/at-search-status
chmod +x /usr/local/bin/at-search-enrich
chmod +x /usr/local/bin/at-search-reset-index
echo "[14/14] Setze finale Rechte"
# Hand the tree to the invoking user; the OpenSearch data dir must be
# owned by uid/gid 1000 (the container's opensearch user).
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
# NOTE(review): truncated final heredoc (probably the usage message, as
# in v1); its payload was lost in the wiki export.
cat <