Siehe auch [[ElasticSearch]] (z. B. Scripte, API etc. sind identisch). [[Wazuh]] verwendet [[OpenSearch]].
=====Installation=====
====Demo====
Run a local cluster
docker run --rm -p 9200:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:2.6.0
Create a Python script
from opensearchpy import OpenSearch

# Demo connection to the local single-node container: default admin
# credentials, TLS enabled but certificate checks disabled because the
# container ships a self-signed cert. Do not use these settings in prod.
_conn_opts = {
    "hosts": [{"host": "localhost", "port": 9200}],
    "http_auth": ("admin", "admin"),
    "use_ssl": True,
    "verify_certs": False,
    "ssl_assert_hostname": False,
    "ssl_show_warn": False,
}
client = OpenSearch(**_conn_opts)
client.info()
Get some sample data, e.g. [[https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots|wikipedia-movie-plots]]. Read the data into a pandas DataFrame.
import pandas as pd

# Load the Kaggle movie-plots CSV, drop incomplete rows, then keep a
# reproducible 5000-row sample with a fresh 0..4999 index.
df = pd.read_csv("wiki_movie_plots_deduped.csv")
df = df.dropna()
df = df.sample(5000, random_state=42)
df = df.reset_index(drop=True)
Create an index
# Mapping for the demo "movies" index: fields searched as free text get
# the "english" analyzer (stemming + stopwords) where relevance matters
# most (title, plot), "standard" elsewhere; wiki_page is an exact-match
# keyword; year is numeric so it supports range queries.
body = {
    "mappings": {
        "properties": {
            "title": {"type": "text", "analyzer": "english"},
            "ethnicity": {"type": "text", "analyzer": "standard"},
            "director": {"type": "text", "analyzer": "standard"},
            "cast": {"type": "text", "analyzer": "standard"},
            "genre": {"type": "text", "analyzer": "standard"},
            "plot": {"type": "text", "analyzer": "english"},
            "year": {"type": "integer"},
            "wiki_page": {"type": "keyword"}
        }
    }
}
# Create the index; `client` is the OpenSearch client created above.
response = client.indices.create("movies", body=body)
Push the data into the index
# Index the sample one document at a time (slow: one HTTP request per
# row -- see the bulk variant below). The DataFrame row index is reused
# as the document id.
_FIELD_MAP = [
    ("title", "Title"),
    ("ethnicity", "Origin/Ethnicity"),
    ("director", "Director"),
    ("cast", "Cast"),
    ("genre", "Genre"),
    ("plot", "Plot"),
    ("year", "Release Year"),
    ("wiki_page", "Wiki Page"),
]
for doc_id, row in df.iterrows():
    doc = {field: row[column] for field, column in _FIELD_MAP}
    client.index(index="movies", id=doc_id, body=doc)
More data in a bulk
from opensearchpy.helpers import bulk

# Same documents as above, but pushed through the bulk helper: the rows
# are batched into far fewer HTTP round trips.
bulk_data = [
    {
        "_index": "movies",
        "_id": i,
        "_source": {
            "title": row["Title"],
            "ethnicity": row["Origin/Ethnicity"],
            "director": row["Director"],
            "cast": row["Cast"],
            "genre": row["Genre"],
            "plot": row["Plot"],
            "year": row["Release Year"],
            "wiki_page": row["Wiki Page"],
        },
    }
    for i, row in df.iterrows()
]
bulk(client, bulk_data)
Count the inserted data
# Make the just-indexed documents visible to search, then print the
# document count as JSON.
client.indices.refresh(index="movies")
client.cat.count(index="movies", format="json")
Search the data
# Phrase search: movies whose cast phrase-matches "jack nicholson" but
# whose director does NOT phrase-match "tim burton". The exclusion sits
# in the filter context, so it does not affect scoring.
resp = client.search(
    index="movies",
    body={
        "query": {
            "bool": {
                "must": {
                    "match_phrase": {
                        "cast": "jack nicholson",
                    }
                },
                "filter": {"bool": {"must_not": {"match_phrase": {"director": "tim burton"}}}},
            },
        },
    }
)
resp
Remove documents
client.delete(index="movies", id="2500")
Delete the index
client.indices.delete(index='movies')
====Production====
===v1===
Run as root
#!/usr/bin/env bash
# Setup script v1: installs Docker + Java, runs OpenSearch in a
# container and configures an Apache Nutch crawler for *.at domains.
set -Eeuo pipefail
# Report the failing line number on any error, then abort.
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR
# Root required: installs packages, writes to /etc and /opt.
if [[ "${EUID}" -ne 0 ]]; then
echo "Bitte mit sudo/root ausführen."
exit 1
fi
# Install locations and pinned versions; REAL_USER/REAL_GROUP identify
# the invoking sudo user, who will own ${APP_ROOT} at the end.
APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER")"
echo "[1/12] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
curl \
wget \
jq \
tar \
gzip \
unzip \
ca-certificates \
gnupg \
lsb-release \
software-properties-common \
docker.io \
docker-compose \
openjdk-11-jdk-headless
systemctl enable docker
systemctl restart docker
echo "[2/12] Setze vm.max_map_count"
# OpenSearch/Lucene needs a high mmap count; persist it in sysctl.conf
# (update in place if already set) and apply it immediately.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null
echo "[3/12] Setze JAVA_HOME"
# Derive JAVA_HOME from the resolved java binary (two dirname hops up
# from .../bin/java).
JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")"
# NOTE(review): the next line looks truncated by the wiki export -- it
# appears to merge "cat > /etc/profile.d/at-search-java.sh <<EOF", the
# docker-compose.yml heredoc, the compose startup, and the head of a
# curl health-check loop. Restore from the original script before use.
cat > /etc/profile.d/at-search-java.sh < "${APP_ROOT}/opensearch/docker-compose.yml" </dev/null 2>&1; then
ok=1
break
fi
sleep 2
done
# Fail fast -- with container logs for diagnosis -- if OpenSearch never
# became reachable within the polling window above.
if [[ "${ok}" != "1" ]]; then
echo "[FEHLER] OpenSearch kam nicht hoch:"
docker-compose logs --tail=200 opensearch || true
exit 1
fi
echo "[8/12] Lege Index at_web neu an"
# Recreate the at_web index from scratch: delete any previous one
# (ignore a 404), then PUT an explicit mapping. Single node, so
# 1 shard / 0 replicas.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
-H 'Content-Type: application/json' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"}
}
}
}' >/dev/null
echo "[9/12] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
# Download the Nutch binary release only once; always re-extract so the
# tree is pristine, then point the stable "nutch" symlink at it.
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
"https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
echo "[10/12] Konfiguriere Nutch + Hadoop local paths"
# NOTE(review): the heredoc payloads below were XML property files; the
# wiki export stripped the XML tags, leaving bare name/value pairs.
# They document the intended settings but are not valid nutch-site.xml
# as-is -- restore the XML wrapper from the original before running.
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
http.agent.name
ATSearchBot
http.robots.agents
ATSearchBot,*
plugin.includes
protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic
fetcher.threads.fetch
10
http.content.limit
1048576
db.ignore.external.links
false
db.ignore.internal.links
false
db.ignore.external.links.mode
byHost
generate.max.per.host
25
generate.max.per.domain
100
generate.count.mode
byDomain
fetcher.parse
true
parser.character.encoding.default
UTF-8
EOF
# NOTE(review): "<" here was presumably "<<EOF" before the export
# garbled it (unquoted heredoc so ${APP_ROOT} expands).
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <
fs.defaultFS
file:///
hadoop.tmp.dir
${APP_ROOT}/hadoop-tmp
EOF
cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <
mapreduce.framework.name
local
mapreduce.jobtracker.staging.root.dir
${APP_ROOT}/hadoop-staging
mapreduce.job.reduces
1
EOF
# URL filter: allow only *.at (and co.at/gv.at/ac.at) hosts, reject
# binary/static assets, tracking parameters, then everything else.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-.
EOF
# NOTE(review): the index-writers.xml payload is missing (empty heredoc)
# -- the OpenSearch writer configuration was lost in the export.
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
EOF
# Seed list: well-known Austrian government/news/education sites.
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.oesterreich.gv.at/
https://www.parlament.gv.at/
https://www.orf.at/
https://www.derstandard.at/
https://www.wien.gv.at/
https://www.arbeiterkammer.at/
https://www.wko.at/
https://www.ams.at/
https://www.univie.ac.at/
https://www.tuwien.at/
EOF
echo "[11/12] Schreibe Wrapper"
# at-search-crawl: run the Nutch crawl loop over the seed list.
# NOTE(review): the trailing "2" hardcodes 2 crawl rounds and any
# arguments are ignored, although the usage example below invokes
# "at-search-crawl 2" -- v2 fixes this by reading ROUNDS from $1.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging
exec "${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl 2
EOF
# at-search-reindex-last: re-index the newest crawl segment into
# OpenSearch, removing documents that are gone (-deleteGone).
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
exec "${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
EOF
# at-search-query: multi_match search over title/content/site/host with
# field boosts; jq builds the JSON body safely from the CLI argument.
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
echo 'Nutzung: at-search-query "suchbegriff"'
exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
size: 10,
query: {
multi_match: {
query: $q,
fields: ["title^3","content","site^2","host"]
}
},
_source: ["title","url","host","site"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
-H 'Content-Type: application/json' \
-d "${BODY}" | jq .
EOF
# at-search-status: quick health view -- cluster info, doc count and the
# crawl segments on disk.
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
chmod +x /usr/local/bin/at-search-crawl
chmod +x /usr/local/bin/at-search-reindex-last
chmod +x /usr/local/bin/at-search-query
chmod +x /usr/local/bin/at-search-status
echo "[12/12] Setze finale Rechte"
# Hand the tree to the invoking user; the OpenSearch data dir must be
# owned by uid/gid 1000 (the container's opensearch user).
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
# NOTE(review): "cat <" below is a truncated heredoc (probably a final
# usage message); the following lines look like its payload -- example
# commands plus a seed-swap recipe -- not executed shell code.
cat <
curl http://127.0.0.1:9200
curl http://127.0.0.1:9200/at_web/_count | jq
at-search-status
at-search-crawl 2
at-search-status
at-search-query "wien"
at-search-reindex-last
cp /opt/at-search/seeds/seed.txt /opt/at-search/seeds/seed.txt.bak
cat > /opt/at-search/seeds/seed.txt <<'EOF'
https://www.univie.ac.at/
https://studieren.univie.ac.at/
https://www.tuwien.at/
https://www.tuwien.at/studium/
EOF
at-search-crawl 3
mv /opt/at-search/seeds/seed.txt.bak /opt/at-search/seeds/seed.txt
===v2===
#!/usr/bin/env bash
# Setup script v2: like v1 (Docker + OpenSearch + Nutch) but adds a
# Python enrichment pipeline and extra wrapper commands (14 steps).
set -Eeuo pipefail
# Report the failing line number on any error, then abort.
trap 'echo; echo "[FEHLER] setup.sh abgebrochen in Zeile $LINENO"; exit 1' ERR
# Root required: installs packages, writes to /etc and /opt.
if [[ "${EUID}" -ne 0 ]]; then
echo "Bitte mit sudo/root ausführen."
exit 1
fi
# Install locations and pinned versions; REAL_GROUP falls back to root
# if the group lookup fails (hardened vs. v1).
APP_ROOT="/opt/at-search"
NUTCH_VERSION="1.22"
OPENSEARCH_VERSION="1.3.20"
REAL_USER="${SUDO_USER:-root}"
REAL_GROUP="$(id -gn "$REAL_USER" 2>/dev/null || echo root)"
echo "[1/14] Installiere Pakete"
export DEBIAN_FRONTEND=noninteractive
apt-get update
apt-get install -y \
curl \
wget \
jq \
tar \
gzip \
unzip \
ca-certificates \
gnupg \
lsb-release \
software-properties-common \
docker.io \
docker-compose \
openjdk-11-jdk-headless \
python3 \
python3-venv \
python3-pip
systemctl enable docker
systemctl restart docker
echo "[2/14] Setze vm.max_map_count"
# OpenSearch/Lucene needs a high mmap count; persist and apply it.
if grep -q '^vm.max_map_count=' /etc/sysctl.conf; then
sed -i 's/^vm.max_map_count=.*/vm.max_map_count=262144/' /etc/sysctl.conf
else
echo 'vm.max_map_count=262144' >> /etc/sysctl.conf
fi
sysctl -w vm.max_map_count=262144 >/dev/null
echo "[3/14] Setze JAVA_HOME"
# Derive JAVA_HOME from the resolved java binary path.
JAVA_HOME_PATH="$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")"
# NOTE(review): truncated by the wiki export like the v1 counterpart --
# this line merges the profile.d heredoc, the docker-compose.yml
# heredoc, compose startup and the head of a curl health-check loop.
# Restore from the original script before running.
cat > /etc/profile.d/at-search-java.sh < "${APP_ROOT}/opensearch/docker-compose.yml" </dev/null 2>&1; then
ok=1
break
fi
sleep 2
done
# Fail fast, with container logs, if OpenSearch never became reachable.
if [[ "${ok}" != "1" ]]; then
echo "[FEHLER] OpenSearch kam nicht hoch:"
docker-compose logs --tail=200 opensearch || true
exit 1
fi
echo "[9/14] Lege Index at_web neu an"
# Recreate at_web. The v2 mapping extends v1 with the enrichment fields
# (url_depth, is_root, content_length, domain_class, host_rank) that
# enrich_index.py writes after each crawl.
curl -fsS -X DELETE "http://127.0.0.1:9200/at_web" >/dev/null 2>&1 || true
curl -fsS -X PUT "http://127.0.0.1:9200/at_web" \
-H 'Content-Type: application/json' \
-d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
"url_depth": {"type": "integer"},
"is_root": {"type": "boolean"},
"content_length": {"type": "integer"},
"domain_class": {"type": "keyword"},
"host_rank": {"type": "float"}
}
}
}' >/dev/null
echo "[10/14] Lade Apache Nutch"
cd "${APP_ROOT}/downloads"
# Download once, always re-extract, repoint the "nutch" symlink.
if [[ ! -f "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" ]]; then
wget -O "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" \
"https://dlcdn.apache.org/nutch/${NUTCH_VERSION}/apache-nutch-${NUTCH_VERSION}-bin.tar.gz"
fi
rm -rf "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}"
tar -xzf "apache-nutch-${NUTCH_VERSION}-bin.tar.gz" -C "${APP_ROOT}"
ln -sfn "${APP_ROOT}/apache-nutch-${NUTCH_VERSION}" "${APP_ROOT}/nutch"
mkdir -p "${APP_ROOT}/nutch/logs" "${APP_ROOT}/crawl"
echo "[11/14] Konfiguriere Nutch + Hadoop"
# NOTE(review): as in v1, the XML tags of the heredoc payloads were
# stripped by the wiki export -- bare name/value pairs remain. Restore
# the XML wrapper from the original before running. v2 raises the
# per-host/per-domain fetch limits (200/1000 vs. 25/100 in v1).
cat > "${APP_ROOT}/nutch/conf/nutch-site.xml" <<'EOF'
http.agent.name
ATSearchBot
http.robots.agents
ATSearchBot,*
plugin.includes
protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more|metadata)|indexer-opensearch-1x|urlnormalizer-(pass|regex|basic)|scoring-opic
fetcher.threads.fetch
10
http.content.limit
1048576
db.ignore.external.links
false
db.ignore.internal.links
false
db.ignore.external.links.mode
byHost
generate.max.per.host
200
generate.max.per.domain
1000
generate.count.mode
byDomain
fetcher.parse
true
parser.character.encoding.default
UTF-8
EOF
# NOTE(review): "<" was presumably "<<EOF" before the export mangled it.
cat > "${APP_ROOT}/nutch/conf/core-site.xml" <
fs.defaultFS
file:///
hadoop.tmp.dir
${APP_ROOT}/hadoop-tmp
EOF
cat > "${APP_ROOT}/nutch/conf/mapred-site.xml" <
mapreduce.framework.name
local
mapreduce.jobtracker.staging.root.dir
${APP_ROOT}/hadoop-staging
mapreduce.job.reduces
1
EOF
# URL filter: v1 rules plus rejection of tag/category/archive/search
# listing pages and paginated /page/N/ URLs.
cat > "${APP_ROOT}/nutch/conf/regex-urlfilter.txt" <<'EOF'
+^https?://([a-z0-9-]+\.)*(at|co\.at|gv\.at|ac\.at)(/.*)?$
-^https?://.+\.(7z|avi|bin|css|doc|docx|gif|gz|ico|iso|jar|jpeg|jpg|js|mov|mp3|mp4|pdf|png|rar|svg|tar|tgz|webp|woff|woff2|xls|xlsx|zip)(\?.*)?$
-^https?://.*(\?|&)(utm_|fbclid=|gclid=|reply=|share=|sort=|filter=|session=).*$
-^https?://.*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?$
-^https?://.*/(page|seite)/[0-9]+/?$
-.
EOF
# NOTE(review): index-writers.xml payload missing (empty heredoc).
cat > "${APP_ROOT}/nutch/conf/index-writers.xml" <<'EOF'
EOF
# Seed list (same hosts as v1, different order).
cat > "${APP_ROOT}/seeds/seed.txt" <<'EOF'
https://www.derstandard.at/
https://orf.at/
https://www.parlament.gv.at/
https://www.tuwien.at/
https://www.univie.ac.at/
https://www.wko.at/
https://www.arbeiterkammer.at/
https://www.ams.at/
https://www.wien.gv.at/
https://www.oesterreich.gv.at/
EOF
echo "[12/14] Schreibe Enrichment-Script"
cat > "${APP_ROOT}/enrich/enrich_index.py" <<'EOF'
#!/usr/bin/env python3
"""Post-crawl enrichment for the at_web index.

Adds derived ranking fields (url_depth, is_root, content_length,
domain_class, host_rank) to every document via the bulk update API.
"""
import json
from urllib.parse import urlparse
import requests

OS_URL = "http://127.0.0.1:9200"  # local single-node OpenSearch
INDEX = "at_web"
BATCH = 200  # documents per search_after page (and per bulk request)
def domain_class(host: str) -> str:
    """Classify *host* into a coarse category used for ranking boosts.

    Categories: "gov" (gv.at incl. parliament), "edu" (ac.at),
    "news" (ORF / derStandard), "commerce" (WKO), else "general".
    """
    h = (host or "").lower()
    if h.endswith(".gv.at") or h == "www.parlament.gv.at":
        return "gov"
    if h.endswith(".ac.at"):
        return "edu"
    if "orf.at" in h or "derstandard.at" in h:
        return "news"
    return "commerce" if "wko.at" in h else "general"
def host_rank(host: str) -> float:
    """Return a static per-host quality weight (higher = boosted)."""
    h = (host or "").lower()
    # Whole TLD-level rules first.
    if h.endswith(".gv.at") or h == "www.parlament.gv.at":
        return 3.0
    if h.endswith(".ac.at"):
        return 2.5
    # Exact-host weights; everything else gets the neutral weight.
    exact_weights = {
        "www.orf.at": 2.0,
        "www.derstandard.at": 1.8,
        "www.wko.at": 1.4,
    }
    return exact_weights.get(h, 1.0)
def compute_fields(src: dict) -> dict:
    """Derive the enrichment fields from a doc's url/host/content."""
    url = src.get("url") or ""
    content = src.get("content") or ""
    # Path depth = number of non-empty path segments; the site root
    # (empty path or "/") has depth 0 and is flagged is_root.
    path = urlparse(url).path or "/"
    segments = [part for part in path.strip("/").split("/") if part]
    host = src.get("host", "")
    return {
        "url_depth": len(segments),
        "is_root": not path.strip("/"),
        "content_length": len(content),
        "domain_class": domain_class(host),
        "host_rank": host_rank(host),
    }
def main():
    """Page through every document in INDEX and write back the derived
    fields; prints the total number of enriched documents."""
    session = requests.Session()
    search_after = None
    total = 0
    while True:
        # Deep pagination via search_after over a stable sort key.
        body = {
            "size": BATCH,
            # NOTE(review): sorting on "_id" can require fielddata on
            # the _id field depending on the OpenSearch/ES version --
            # confirm this works against the deployed 1.3.x cluster.
            "sort": [{"_id": "asc"}],
            "_source": ["url", "host", "content"],
            "query": {"match_all": {}}
        }
        if search_after is not None:
            body["search_after"] = search_after
        r = session.post(f"{OS_URL}/{INDEX}/_search", json=body, timeout=30)
        r.raise_for_status()
        data = r.json()
        hits = data.get("hits", {}).get("hits", [])
        if not hits:
            # No more pages -- done.
            break
        lines = []
        for hit in hits:
            doc_id = hit["_id"]
            src = hit.get("_source", {})
            fields = compute_fields(src)
            # Bulk "update" action: metadata line + partial-doc line.
            meta = {"update": {"_index": INDEX, "_id": doc_id}}
            doc = {"doc": fields}
            lines.append(json.dumps(meta))
            lines.append(json.dumps(doc))
            total += 1
        # The bulk API requires NDJSON terminated by a newline.
        payload = "\n".join(lines) + "\n"
        br = session.post(
            f"{OS_URL}/_bulk",
            data=payload,
            headers={"Content-Type": "application/x-ndjson"},
            timeout=60,
        )
        br.raise_for_status()
        # Resume the next page after the last hit's sort values.
        search_after = hits[-1]["sort"]
    print(f"Enriched documents: {total}")


if __name__ == "__main__":
    main()
EOF
chmod +x "${APP_ROOT}/enrich/enrich_index.py"
echo "[13/14] Schreibe Wrapper"
# at-search-crawl: crawl for ROUNDS rounds (default 5, overridable via
# $1 -- fixed vs. v1's hardcoded 2), then run the enrichment pass.
cat > /usr/local/bin/at-search-crawl <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
ROUNDS="${1:-5}"
mkdir -p /opt/at-search/crawl /opt/at-search/nutch/logs /opt/at-search/hadoop-tmp /opt/at-search/hadoop-staging
"${NUTCH_HOME}/bin/crawl" -i -s /opt/at-search/seeds /opt/at-search/crawl "${ROUNDS}"
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-reindex-last: re-index the newest segment, then re-enrich.
cat > /usr/local/bin/at-search-reindex-last <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
export JAVA_HOME="${JAVA_HOME:-$(dirname "$(dirname "$(readlink -f /usr/bin/java)")")}"
export NUTCH_HOME="/opt/at-search/nutch"
export NUTCH_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_CONF_DIR="${NUTCH_HOME}/conf"
export HADOOP_OPTS="-Djava.io.tmpdir=/opt/at-search/hadoop-tmp"
SEG="$(ls -1 /opt/at-search/crawl/segments | sort | tail -n1)"
"${NUTCH_HOME}/bin/nutch" index /opt/at-search/crawl/crawldb -linkdb /opt/at-search/crawl/linkdb "/opt/at-search/crawl/segments/${SEG}" -deleteGone
/opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-enrich: run only the enrichment pass.
cat > /usr/local/bin/at-search-enrich <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
exec /opt/at-search/venv/bin/python /opt/at-search/enrich/enrich_index.py
EOF
# at-search-reset-index: drop and recreate at_web with the v2 mapping.
cat > /usr/local/bin/at-search-reset-index <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
curl -s -X DELETE http://127.0.0.1:9200/at_web >/dev/null 2>&1 || true
curl -s -X PUT http://127.0.0.1:9200/at_web -H 'Content-Type: application/json' -d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"properties": {
"url": {"type": "keyword"},
"host": {"type": "keyword"},
"site": {"type": "keyword"},
"title": {"type": "text"},
"content": {"type": "text"},
"digest": {"type": "keyword"},
"tstamp": {"type": "date", "format": "strict_date_optional_time||epoch_millis"},
"url_depth": {"type": "integer"},
"is_root": {"type": "boolean"},
"content_length": {"type": "integer"},
"domain_class": {"type": "keyword"},
"host_rank": {"type": "float"}
}
}
}' | jq .
EOF
# at-search-query: function_score query -- text relevance plus per-class
# weights and field_value_factor boosts; excludes root pages and
# listing/pagination URLs.
# NOTE(review): the url_depth factor (0.25, boost_mode sum) *rewards*
# deeper URLs -- verify that is intended; depth is usually penalized.
cat > /usr/local/bin/at-search-query <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
if [[ $# -lt 1 ]]; then
echo 'Nutzung: at-search-query "suchbegriff"'
exit 1
fi
QUERY="$*"
BODY="$(jq -n --arg q "$QUERY" '{
size: 10,
query: {
function_score: {
query: {
bool: {
must: [
{
multi_match: {
query: $q,
fields: ["title^8","content^2","site","host"],
type: "best_fields"
}
}
],
must_not: [
{ term: { is_root: true } },
{ regexp: { url: ".*/(tag|tags|category|categories|thema|themen|archiv|archive|suche|search|index|author|feed|rss)(/.*)?" } },
{ regexp: { url: ".*/(page|seite)/[0-9]+/?" } }
],
should: [
{ match_phrase: { title: { query: $q, boost: 10 } } }
]
}
},
functions: [
{ filter: { term: { domain_class: "gov" } }, weight: 3.0 },
{ filter: { term: { domain_class: "edu" } }, weight: 2.5 },
{ filter: { term: { domain_class: "news" } }, weight: 1.5 },
{ field_value_factor: { field: "host_rank", factor: 1.0, missing: 1.0 } },
{ field_value_factor: { field: "url_depth", factor: 0.25, missing: 0 } },
{ field_value_factor: { field: "content_length", factor: 0.0005, modifier: "log1p", missing: 1 } }
],
score_mode: "sum",
boost_mode: "sum"
}
},
_source: ["title","url","host","site","url_depth","is_root","domain_class","host_rank"]
}')"
curl -fsS "http://127.0.0.1:9200/at_web/_search" \
-H 'Content-Type: application/json' \
-d "${BODY}" | jq .
EOF
# at-search-status: cluster info, doc count and segments on disk.
cat > /usr/local/bin/at-search-status <<'EOF'
#!/usr/bin/env bash
set -Eeuo pipefail
echo "=== OpenSearch ==="
curl -s http://127.0.0.1:9200 | jq .
echo
echo "=== Count ==="
curl -s http://127.0.0.1:9200/at_web/_count | jq .
echo
echo "=== Segments ==="
ls -1 /opt/at-search/crawl/segments 2>/dev/null || true
EOF
chmod +x /usr/local/bin/at-search-crawl
chmod +x /usr/local/bin/at-search-reindex-last
chmod +x /usr/local/bin/at-search-query
chmod +x /usr/local/bin/at-search-status
chmod +x /usr/local/bin/at-search-enrich
chmod +x /usr/local/bin/at-search-reset-index
echo "[14/14] Setze finale Rechte"
# Hand the tree to the invoking user; the OpenSearch data dir must be
# owned by uid/gid 1000 (the container's opensearch user).
chown -R "${REAL_USER}:${REAL_GROUP}" "${APP_ROOT}"
chown -R 1000:1000 "${APP_ROOT}/opensearch/data"
chmod +x "${APP_ROOT}/nutch/bin/nutch" "${APP_ROOT}/nutch/bin/crawl"
# NOTE(review): truncated final heredoc (probably the usage message, as
# in v1); its payload was lost in the wiki export.
cat <